]> git.ipfire.org Git - thirdparty/openssl.git/blob - crypto/modes/asm/ghash-sparcv9.pl
0365e0f1ff429e8e5250e86a627616fb58cbbf3f
[thirdparty/openssl.git] / crypto / modes / asm / ghash-sparcv9.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9
10 # March 2010
11 #
12 # The module implements "4-bit" GCM GHASH function and underlying
13 # single multiplication operation in GF(2^128). "4-bit" means that it
14 # uses 256 bytes per-key table [+128 bytes shared table]. Performance
15 # results are for streamed GHASH subroutine on UltraSPARC pre-Tx CPU
16 # and are expressed in cycles per processed byte, less is better:
17 #
18 # gcc 3.3.x cc 5.2 this assembler
19 #
20 # 32-bit build 81.4 43.3 12.6 (+546%/+244%)
21 # 64-bit build 20.2 21.2 12.6 (+60%/+68%)
22 #
23 # Here is data collected on UltraSPARC T1 system running Linux:
24 #
25 # gcc 4.4.1 this assembler
26 #
27 # 32-bit build 566 50 (+1000%)
28 # 64-bit build 56 50 (+12%)
29 #
30 # I don't quite understand why difference between 32-bit and 64-bit
31 # compiler-generated code is so big. Compilers *were* instructed to
32 # generate code for UltraSPARC and should have used 64-bit registers
33 # for Z vector (see C code) even in 32-bit build... Oh well, it only
34 # means more impressive improvement coefficients for this assembler
35 # module;-) Loops are aggressively modulo-scheduled in respect to
36 # references to input data and Z.hi updates to achieve 12 cycles
37 # timing. To anchor to something else, sha1-sparcv9.pl spends 11.6
38 # cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1.
39 #
40 # October 2012
41 #
42 # Add VIS3 lookup-table-free implementation using polynomial
43 # multiplication xmulx[hi] and extended addition addxc[cc]
44 # instructions. 4.52/7.63x improvement on T3/T4 or in absolute
45 # terms 7.90/2.14 cycles per byte. On T4 multi-process benchmark
46 # saturates at ~15.5x single-process result on 8-core processor,
47 # or ~20.5GBps per 2.85GHz socket.
48
# Detect the target word size from the compiler flags forwarded on the
# command line: -m64 (gcc) or -xarch=v9 (Sun cc) select the 64-bit ABI.
$bits=32;
for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
# Stack frame size and stack bias differ between ABIs; 2047 is the
# architectural SPARC V9 stack bias.
if ($bits==64)	{ $bias=2047; $frame=192; }
else		{ $bias=0;    $frame=112; }

# All generated assembly is emitted via STDOUT, so redirect it to the
# requested output file.  Fail loudly instead of silently producing an
# empty build artifact when the file cannot be created.
$output=shift;
open STDOUT,">$output" or die "can't open $output: $!";
56
# Register allocation for the 4-bit table-driven code paths, expressed
# as list assignments in the same style as the VIS3 section below.

# 64-bit working values live in the %o registers.
($Zhi,$Zlo,$Thi,$Tlo,$rem,$tmp)=map("%o$_",(0..5));

# Small values and pointers occupy the %l registers.
($nhi,$nlo,$xi0,$xi1,$rem_4bit,$remi,$Htblo,$cnt)=map("%l$_",(0..7));

# Input argument block as delivered in the %i registers.
($Xi,$Htbl,$inp,$len)=map("%i$_",(0..3));
77
# 64-bit builds must declare %g2/%g3 as scratch so the assembler accepts
# their use under the V9 ABI.
$code.=<<___ if ($bits==64);
.register	%g2,#scratch
.register	%g3,#scratch
___

# rem_4bit is the standard 4-bit GHASH reduction table (16 entries of
# 0x1C20*i shifted into the top bits, stored as 32-bit word pairs).
# gcm_ghash_4bit is the streamed hash: Xi ^= inp[0..15], Xi = Xi*H,
# repeated until $inp reaches $len.  The inner loop is modulo-scheduled
# with respect to input loads and Z.hi updates (see the 12-cycle note
# in the header); statement order inside the heredoc is deliberate.
$code.=<<___;
.section	".text",#alloc,#execinstr

.align	64
rem_4bit:
	.long	`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
	.long	`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
	.long	`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
	.long	`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
.type	rem_4bit,#object
.size	rem_4bit,(.-rem_4bit)

.globl	gcm_ghash_4bit
.align	32
gcm_ghash_4bit:
	save	%sp,-$frame,%sp
	ldub	[$inp+15],$nlo
	ldub	[$Xi+15],$xi0
	ldub	[$Xi+14],$xi1
	add	$len,$inp,$len
	add	$Htbl,8,$Htblo

1:	call	.+8
	add	%o7,rem_4bit-1b,$rem_4bit

.Louter:
	xor	$xi0,$nlo,$nlo
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	sll	$nlo,4,$nlo
	ldx	[$Htblo+$nlo],$Zlo
	ldx	[$Htbl+$nlo],$Zhi

	ldub	[$inp+14],$nlo

	ldx	[$Htblo+$nhi],$Tlo
	and	$Zlo,0xf,$remi
	ldx	[$Htbl+$nhi],$Thi
	sll	$remi,3,$remi
	ldx	[$rem_4bit+$remi],$rem
	srlx	$Zlo,4,$Zlo
	mov	13,$cnt
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo

	xor	$xi1,$nlo,$nlo
	and	$Zlo,0xf,$remi
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	ba	.Lghash_inner
	sll	$nlo,4,$nlo
.align	32
.Lghash_inner:
	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$inp+$cnt],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	ldub	[$Xi+$cnt],$xi1
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$xi1,$nlo,$nlo
	srlx	$Zhi,4,$Zhi
	and	$nlo,0xf0,$nhi
	addcc	$cnt,-1,$cnt
	xor	$Zlo,$tmp,$Zlo
	and	$nlo,0x0f,$nlo
	xor	$Tlo,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	blu	.Lghash_inner
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi

	add	$inp,16,$inp
	cmp	$inp,$len
	be,pn	`$bits==64?"%xcc":"%icc"`,.Ldone
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$inp+15],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]
	srl	$Zlo,8,$xi1
	and	$Zlo,0xff,$xi0
	ba	.Louter
	and	$xi1,0xff,$xi1
.align	32
.Ldone:
	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]

	ret
	restore
.type	gcm_ghash_4bit,#function
.size	gcm_ghash_4bit,(.-gcm_ghash_4bit)
___
230
# $inp/$len are only used by the streamed routine above; drop the
# aliases so the single-block routine cannot reference them by mistake.
undef $inp;
undef $len;

# gcm_gmult_4bit computes a single multiplication Xi = Xi*H.  It mirrors
# one iteration of gcm_ghash_4bit's outer loop without the input XOR;
# the inner loop is modulo-scheduled, so statement order is deliberate.
$code.=<<___;
.globl	gcm_gmult_4bit
.align	32
gcm_gmult_4bit:
	save	%sp,-$frame,%sp
	ldub	[$Xi+15],$nlo
	add	$Htbl,8,$Htblo

1:	call	.+8
	add	%o7,rem_4bit-1b,$rem_4bit

	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	sll	$nlo,4,$nlo
	ldx	[$Htblo+$nlo],$Zlo
	ldx	[$Htbl+$nlo],$Zhi

	ldub	[$Xi+14],$nlo

	ldx	[$Htblo+$nhi],$Tlo
	and	$Zlo,0xf,$remi
	ldx	[$Htbl+$nhi],$Thi
	sll	$remi,3,$remi
	ldx	[$rem_4bit+$remi],$rem
	srlx	$Zlo,4,$Zlo
	mov	13,$cnt
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo

	and	$Zlo,0xf,$remi
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	ba	.Lgmult_inner
	sll	$nlo,4,$nlo
.align	32
.Lgmult_inner:
	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$Xi+$cnt],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	srlx	$Zhi,4,$Zhi
	and	$nlo,0xf0,$nhi
	addcc	$cnt,-1,$cnt
	xor	$Zlo,$tmp,$Zlo
	and	$nlo,0x0f,$nlo
	xor	$Tlo,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	blu	.Lgmult_inner
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]

	ret
	restore
.type	gcm_gmult_4bit,#function
.size	gcm_gmult_4bit,(.-gcm_gmult_4bit)
___
338 \f
{{{
# Straightforward 128x128-bit multiplication using Karatsuba algorithm
# followed by pair of 64-bit reductions [with a shortcut in first one,
# which allowed to break dependency between reductions and remove one
# multiplication from critical path]. While it might be suboptimal
# with regard to sheer number of multiplications, other methods [such
# as aggregate reduction] would require more 64-bit registers, which
# we don't have in 32-bit application context.

# Input argument block in the %i registers.
($Xip,$Htable,$inp,$len)=map("%i$_",(0..3));

# Working set: twisted H, Xi, reduction constants and Karatsuba partial
# products, spread over %o and %g registers.
($Hhl,$Hlo,$Hhi,$Xlo,$Xhi,$xE1,$sqr, $C0,$C1,$C2,$C3,$V)=
	(map("%o$_",(0..5,7)),map("%g$_",(1..5)));

# Byte-alignment shift counts used by gcm_ghash_vis3's unaligned path.
($shl,$shr)=map("%l$_",(0..7));

# For details regarding "twisted H" see ghash-x86.pl.
$code.=<<___;
.globl	gcm_init_vis3
.align	32
gcm_init_vis3:
	save	%sp,-$frame,%sp

	ldx	[%i1+0],$Hhi
	ldx	[%i1+8],$Hlo
	mov	0xE1,$Xhi
	mov	1,$Xlo
	sllx	$Xhi,57,$Xhi
	srax	$Hhi,63,$C0		! broadcast carry
	addcc	$Hlo,$Hlo,$Hlo		! H<<=1
	addxc	$Hhi,$Hhi,$Hhi
	and	$C0,$Xlo,$Xlo
	and	$C0,$Xhi,$Xhi
	xor	$Xlo,$Hlo,$Hlo
	xor	$Xhi,$Hhi,$Hhi
	stx	$Hlo,[%i0+8]		! save twisted H
	stx	$Hhi,[%i0+0]

	sethi	%hi(0xA0406080),$V
	sethi	%hi(0x20C0E000),%l0
	or	$V,%lo(0xA0406080),$V
	or	%l0,%lo(0x20C0E000),%l0
	sllx	$V,32,$V
	or	%l0,$V,$V		! (0xE0·i)&0xff=0xA040608020C0E000
	stx	$V,[%i0+16]

	ret
	restore
.type	gcm_init_vis3,#function
.size	gcm_init_vis3,.-gcm_init_vis3

.globl	gcm_gmult_vis3
.align	32
gcm_gmult_vis3:
	save	%sp,-$frame,%sp

	ldx	[$Xip+8],$Xlo		! load Xi
	ldx	[$Xip+0],$Xhi
	ldx	[$Htable+8],$Hlo	! load twisted H
	ldx	[$Htable+0],$Hhi

	mov	0xE1,%l7
	sllx	%l7,57,$xE1		! 57 is not a typo
	ldx	[$Htable+16],$V		! (0xE0·i)&0xff=0xA040608020C0E000

	xor	$Hhi,$Hlo,$Hhl		! Karatsuba pre-processing
	xmulx	$Xlo,$Hlo,$C0
	xor	$Xlo,$Xhi,$C2		! Karatsuba pre-processing
	xmulx	$C2,$Hhl,$C1
	xmulxhi	$Xlo,$Hlo,$Xlo
	xmulxhi	$C2,$Hhl,$C2
	xmulxhi	$Xhi,$Hhi,$C3
	xmulx	$Xhi,$Hhi,$Xhi

	sll	$C0,3,$sqr
	srlx	$V,$sqr,$sqr		! ·0xE0 [implicit &(7<<3)]
	xor	$C0,$sqr,$sqr
	sllx	$sqr,57,$sqr		! ($C0·0xE1)<<1<<56 [implicit &0x7f]

	xor	$C0,$C1,$C1		! Karatsuba post-processing
	xor	$Xlo,$C2,$C2
	xor	$sqr,$Xlo,$Xlo		! real destination is $C1
	xor	$C3,$C2,$C2
	xor	$Xlo,$C1,$C1
	xor	$Xhi,$C2,$C2
	xor	$Xhi,$C1,$C1

	xmulxhi	$C0,$xE1,$Xlo		! ·0xE1<<1<<56
	xor	$C0,$C2,$C2
	xmulx	$C1,$xE1,$C0
	xor	$C1,$C3,$C3
	xmulxhi	$C1,$xE1,$C1

	xor	$Xlo,$C2,$C2
	xor	$C0,$C2,$C2
	xor	$C1,$C3,$C3

	stx	$C2,[$Xip+8]		! save Xi
	stx	$C3,[$Xip+0]

	ret
	restore
.type	gcm_gmult_vis3,#function
.size	gcm_gmult_vis3,.-gcm_gmult_vis3

.globl	gcm_ghash_vis3
.align	32
gcm_ghash_vis3:
	save	%sp,-$frame,%sp

	ldx	[$Xip+8],$C2		! load Xi
	ldx	[$Xip+0],$C3
	ldx	[$Htable+8],$Hlo	! load twisted H
	ldx	[$Htable+0],$Hhi

	mov	0xE1,%l7
	sllx	%l7,57,$xE1		! 57 is not a typo
	ldx	[$Htable+16],$V		! (0xE0·i)&0xff=0xA040608020C0E000

	and	$inp,7,$shl
	andn	$inp,7,$inp
	sll	$shl,3,$shl
	prefetch [$inp+63], 20
	sub	%g0,$shl,$shr

	xor	$Hhi,$Hlo,$Hhl		! Karatsuba pre-processing
.Loop:
	ldx	[$inp+8],$Xlo
	brz,pt	$shl,1f
	ldx	[$inp+0],$Xhi

	ldx	[$inp+16],$C1		! align data
	srlx	$Xlo,$shr,$C0
	sllx	$Xlo,$shl,$Xlo
	sllx	$Xhi,$shl,$Xhi
	srlx	$C1,$shr,$C1
	or	$C0,$Xhi,$Xhi
	or	$C1,$Xlo,$Xlo
1:
	add	$inp,16,$inp
	sub	$len,16,$len
	xor	$C2,$Xlo,$Xlo
	xor	$C3,$Xhi,$Xhi
	prefetch [$inp+63], 20

	xmulx	$Xlo,$Hlo,$C0
	xor	$Xlo,$Xhi,$C2		! Karatsuba pre-processing
	xmulx	$C2,$Hhl,$C1
	xmulxhi	$Xlo,$Hlo,$Xlo
	xmulxhi	$C2,$Hhl,$C2
	xmulxhi	$Xhi,$Hhi,$C3
	xmulx	$Xhi,$Hhi,$Xhi

	sll	$C0,3,$sqr
	srlx	$V,$sqr,$sqr		! ·0xE0 [implicit &(7<<3)]
	xor	$C0,$sqr,$sqr
	sllx	$sqr,57,$sqr		! ($C0·0xE1)<<1<<56 [implicit &0x7f]

	xor	$C0,$C1,$C1		! Karatsuba post-processing
	xor	$Xlo,$C2,$C2
	xor	$sqr,$Xlo,$Xlo		! real destination is $C1
	xor	$C3,$C2,$C2
	xor	$Xlo,$C1,$C1
	xor	$Xhi,$C2,$C2
	xor	$Xhi,$C1,$C1

	xmulxhi	$C0,$xE1,$Xlo		! ·0xE1<<1<<56
	xor	$C0,$C2,$C2
	xmulx	$C1,$xE1,$C0
	xor	$C1,$C3,$C3
	xmulxhi	$C1,$xE1,$C1

	xor	$Xlo,$C2,$C2
	xor	$C0,$C2,$C2
	brnz,pt	$len,.Loop
	xor	$C1,$C3,$C3

	stx	$C2,[$Xip+8]		! save Xi
	stx	$C3,[$Xip+0]

	ret
	restore
.type	gcm_ghash_vis3,#function
.size	gcm_ghash_vis3,.-gcm_ghash_vis3
___
}}}
# Embedded identification string shared by all code paths above.
$code.=<<___;
.asciz	"GHASH for SPARCv9/VIS3, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___
529
530 \f
531 # Purpose of these subroutines is to explicitly encode VIS instructions,
532 # so that one can compile the module without having to specify VIS
# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
534 # Idea is to reserve for option to produce "universal" binary and let
535 # programmer detect if current CPU is VIS capable at run-time.
# Encode a VIS3 instruction (addxc/addxccc/xmulx/xmulxhi) on integer
# registers as a raw ".word" directive, so the module assembles even
# when the assembler lacks VIS3 support.  Returns the instruction
# unchanged when the mnemonic is unknown or an operand is not an
# integer register of the form %[goli]N.
sub unvis3 {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
# Base encoding of each register window group.
my %reg_base  = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
# IMPL-dep opcode field (opf) for the supported VIS3 instructions.
my %opf_for   = ( "addxc"   => 0x011,
		  "addxccc" => 0x013,
		  "xmulx"   => 0x115,
		  "xmulxhi" => 0x116 );

my $asm = "$mnemonic\t$rs1,$rs2,$rd";	# textual fallback form

my $opf = $opf_for{$mnemonic};
return $asm unless $opf;

# Translate each operand to its 5-bit register number; bail out to the
# textual form as soon as one operand is not an integer register.
my @num;
for my $operand ($rs1,$rs2,$rd) {
    return $asm unless $operand =~ /%([goli])([0-9])/;
    push @num, $reg_base{$1}+$2;
}
($rs1,$rs2,$rd) = @num;

return sprintf ".word\t0x%08x !%s",
	       0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
	       $asm;
}
560
# Post-process the accumulated code: evaluate `...` constant
# expressions, then rewrite VIS3 mnemonics as explicit .word encodings
# via unvis3() so no VIS3-capable assembler is required.
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	# xmulx/xmulxhi/addxc/addxccc on integer registers only.
	s/\b(xmulx[hi]*|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
		&unvis3($1,$2,$3,$4)
	 /ge;

	print $_,"\n";
}

# STDOUT is redirected to the output file; buffered write errors (e.g.
# disk full) only surface at close, so the result must be checked or a
# truncated assembly file would go unnoticed by the build.
close STDOUT or die "error closing STDOUT: $!";