#! /usr/bin/env perl
# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# March 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+128 bytes shared table]. Performance
# results are for streamed GHASH subroutine on UltraSPARC pre-Tx CPU
# and are expressed in cycles per processed byte, less is better:
#
#		gcc 3.3.x	cc 5.2		this assembler
#
# 32-bit build	81.4		43.3		12.6	(+546%/+244%)
# 64-bit build	20.2		21.2		12.6	(+60%/+68%)
#
# Here is data collected on UltraSPARC T1 system running Linux:
#
#		gcc 4.4.1	this assembler
#
# 32-bit build	566		50	(+1000%)
# 64-bit build	56		50	(+12%)
#
# I don't quite understand why difference between 32-bit and 64-bit
# compiler-generated code is so big. Compilers *were* instructed to
# generate code for UltraSPARC and should have used 64-bit registers
# for Z vector (see C code) even in 32-bit build... Oh well, it only
# means more impressive improvement coefficients for this assembler
# module;-) Loops are aggressively modulo-scheduled in respect to
# references to input data and Z.hi updates to achieve 12 cycles
# timing. To anchor to something else, sha1-sparcv9.pl spends 11.6
# cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1.
#
# October 2012
#
# Add VIS3 lookup-table-free implementation using polynomial
# multiplication xmulx[hi] and extended addition addxc[cc]
# instructions. 4.52/7.63x improvement on T3/T4 or in absolute
# terms 7.90/2.14 cycles per byte. On T4 multi-process benchmark
# saturates at ~15.5x single-process result on 8-core processor,
# or ~20.5GBps per 2.85GHz socket.
55
# Last command-line argument names the output file; when absent, the
# generated assembly goes to stdout.  The original one-liner used an
# unchecked two-argument open; use three-argument open and fail loudly.
$output = pop;
if (defined $output) {
	open STDOUT, '>', $output or die "can't open $output: $!";
}

$code = "";		# accumulated assembly text, emitted at the bottom of the file

# These two macros are expanded by sparc_arch.h at assembly time.
$frame="STACK_FRAME";	# ABI-correct register-save frame size
$bias="STACK_BIAS";	# stack-pointer bias (differs between 32-/64-bit ABIs)

# Register aliases for the "4-bit" table-driven implementation.
$Zhi="%o0";	# 64-bit values
$Zlo="%o1";
$Thi="%o2";
$Tlo="%o3";
$rem="%o4";
$tmp="%o5";

$nhi="%l0";	# small values and pointers
$nlo="%l1";
$xi0="%l2";
$xi1="%l3";
$rem_4bit="%l4";
$remi="%l5";
$Htblo="%l6";
$cnt="%l7";

$Xi="%i0";	# input argument block
$Htbl="%i1";
$inp="%i2";
$len="%i3";
81
# gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16], const u8 *inp, size_t len):
# streamed "4-bit" table-driven GHASH.  The assembly below is one large
# interpolated here-doc; the $-names are the Perl register aliases defined
# above and `...` expressions are constant-folded by the emitter loop at the
# bottom of the file.  rem_4bit is the 128-byte shared reduction table.
$code.=<<___;
#include "sparc_arch.h"

#ifdef __arch64__
.register	%g2,#scratch
.register	%g3,#scratch
#endif

.section	".text",#alloc,#execinstr

.align	64
rem_4bit:
	.long	`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
	.long	`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
	.long	`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
	.long	`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
.type	rem_4bit,#object
.size	rem_4bit,(.-rem_4bit)

.globl	gcm_ghash_4bit
.align	32
gcm_ghash_4bit:
	save	%sp,-$frame,%sp
	ldub	[$inp+15],$nlo
	ldub	[$Xi+15],$xi0
	ldub	[$Xi+14],$xi1
	add	$len,$inp,$len
	add	$Htbl,8,$Htblo

1:	call	.+8
	add	%o7,rem_4bit-1b,$rem_4bit

.Louter:
	xor	$xi0,$nlo,$nlo
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	sll	$nlo,4,$nlo
	ldx	[$Htblo+$nlo],$Zlo
	ldx	[$Htbl+$nlo],$Zhi

	ldub	[$inp+14],$nlo

	ldx	[$Htblo+$nhi],$Tlo
	and	$Zlo,0xf,$remi
	ldx	[$Htbl+$nhi],$Thi
	sll	$remi,3,$remi
	ldx	[$rem_4bit+$remi],$rem
	srlx	$Zlo,4,$Zlo
	mov	13,$cnt
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo

	xor	$xi1,$nlo,$nlo
	and	$Zlo,0xf,$remi
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	ba	.Lghash_inner
	sll	$nlo,4,$nlo
.align	32
.Lghash_inner:
	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$inp+$cnt],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	ldub	[$Xi+$cnt],$xi1
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$xi1,$nlo,$nlo
	srlx	$Zhi,4,$Zhi
	and	$nlo,0xf0,$nhi
	addcc	$cnt,-1,$cnt
	xor	$Zlo,$tmp,$Zlo
	and	$nlo,0x0f,$nlo
	xor	$Tlo,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	blu	.Lghash_inner
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi

	add	$inp,16,$inp
	cmp	$inp,$len
	be,pn	SIZE_T_CC,.Ldone
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$inp+15],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]
	srl	$Zlo,8,$xi1
	and	$Zlo,0xff,$xi0
	ba	.Louter
	and	$xi1,0xff,$xi1
.align	32
.Ldone:
	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]

	ret
	restore
.type	gcm_ghash_4bit,#function
.size	gcm_ghash_4bit,(.-gcm_ghash_4bit)
___
237
# gcm_gmult_4bit below takes only (Xi, Htbl); drop the third and fourth
# argument aliases so they cannot be referenced by mistake in its here-doc
# (an interpolated undef would silently emit broken assembly).
undef $inp;
undef $len;
240
# gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]): single GF(2^128)
# multiplication Xi·H using the same 4-bit tables as gcm_ghash_4bit above,
# but consuming its input from Xi only (no inp/len arguments).
$code.=<<___;
.globl	gcm_gmult_4bit
.align	32
gcm_gmult_4bit:
	save	%sp,-$frame,%sp
	ldub	[$Xi+15],$nlo
	add	$Htbl,8,$Htblo

1:	call	.+8
	add	%o7,rem_4bit-1b,$rem_4bit

	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	sll	$nlo,4,$nlo
	ldx	[$Htblo+$nlo],$Zlo
	ldx	[$Htbl+$nlo],$Zhi

	ldub	[$Xi+14],$nlo

	ldx	[$Htblo+$nhi],$Tlo
	and	$Zlo,0xf,$remi
	ldx	[$Htbl+$nhi],$Thi
	sll	$remi,3,$remi
	ldx	[$rem_4bit+$remi],$rem
	srlx	$Zlo,4,$Zlo
	mov	13,$cnt
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo

	and	$Zlo,0xf,$remi
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	ba	.Lgmult_inner
	sll	$nlo,4,$nlo
.align	32
.Lgmult_inner:
	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$Xi+$cnt],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	srlx	$Zhi,4,$Zhi
	and	$nlo,0xf0,$nhi
	addcc	$cnt,-1,$cnt
	xor	$Zlo,$tmp,$Zlo
	and	$nlo,0x0f,$nlo
	xor	$Tlo,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	blu	.Lgmult_inner
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]

	ret
	restore
.type	gcm_gmult_4bit,#function
.size	gcm_gmult_4bit,(.-gcm_gmult_4bit)
___
345 \f
{{{
# Straightforward 128x128-bit multiplication using Karatsuba algorithm
# followed by pair of 64-bit reductions [with a shortcut in first one,
# which allowed to break dependency between reductions and remove one
# multiplication from critical path]. While it might be suboptimal
# with regard to sheer number of multiplications, other methods [such
# as aggregate reduction] would require more 64-bit registers, which
# we don't have in 32-bit application context.

# Fresh register aliases for the VIS3 code paths; the 4-bit aliases
# above are deliberately shadowed inside this brace scope.
($Xip,$Htable,$inp,$len)=map("%i$_",(0..3));

($Hhl,$Hlo,$Hhi,$Xlo,$Xhi,$xE1,$sqr, $C0,$C1,$C2,$C3,$V)=
	(map("%o$_",(0..5,7)),map("%g$_",(1..5)));

# Only the first two of the eight generated names are consumed.
($shl,$shr)=map("%l$_",(0..7));

# For details regarding "twisted H" see ghash-x86.pl.
$code.=<<___;
.globl	gcm_init_vis3
.align	32
gcm_init_vis3:
	save	%sp,-$frame,%sp

	ldx	[%i1+0],$Hhi
	ldx	[%i1+8],$Hlo
	mov	0xE1,$Xhi
	mov	1,$Xlo
	sllx	$Xhi,57,$Xhi
	srax	$Hhi,63,$C0		! broadcast carry
	addcc	$Hlo,$Hlo,$Hlo		! H<<=1
	addxc	$Hhi,$Hhi,$Hhi
	and	$C0,$Xlo,$Xlo
	and	$C0,$Xhi,$Xhi
	xor	$Xlo,$Hlo,$Hlo
	xor	$Xhi,$Hhi,$Hhi
	stx	$Hlo,[%i0+8]		! save twisted H
	stx	$Hhi,[%i0+0]

	sethi	%hi(0xA0406080),$V
	sethi	%hi(0x20C0E000),%l0
	or	$V,%lo(0xA0406080),$V
	or	%l0,%lo(0x20C0E000),%l0
	sllx	$V,32,$V
	or	%l0,$V,$V		! (0xE0·i)&0xff=0xA040608020C0E000
	stx	$V,[%i0+16]

	ret
	restore
.type	gcm_init_vis3,#function
.size	gcm_init_vis3,.-gcm_init_vis3

.globl	gcm_gmult_vis3
.align	32
gcm_gmult_vis3:
	save	%sp,-$frame,%sp

	ldx	[$Xip+8],$Xlo		! load Xi
	ldx	[$Xip+0],$Xhi
	ldx	[$Htable+8],$Hlo	! load twisted H
	ldx	[$Htable+0],$Hhi

	mov	0xE1,%l7
	sllx	%l7,57,$xE1		! 57 is not a typo
	ldx	[$Htable+16],$V		! (0xE0·i)&0xff=0xA040608020C0E000

	xor	$Hhi,$Hlo,$Hhl		! Karatsuba pre-processing
	xmulx	$Xlo,$Hlo,$C0
	xor	$Xlo,$Xhi,$C2		! Karatsuba pre-processing
	xmulx	$C2,$Hhl,$C1
	xmulxhi	$Xlo,$Hlo,$Xlo
	xmulxhi	$C2,$Hhl,$C2
	xmulxhi	$Xhi,$Hhi,$C3
	xmulx	$Xhi,$Hhi,$Xhi

	sll	$C0,3,$sqr
	srlx	$V,$sqr,$sqr		! ·0xE0 [implicit &(7<<3)]
	xor	$C0,$sqr,$sqr
	sllx	$sqr,57,$sqr		! ($C0·0xE1)<<1<<56 [implicit &0x7f]

	xor	$C0,$C1,$C1		! Karatsuba post-processing
	xor	$Xlo,$C2,$C2
	xor	$sqr,$Xlo,$Xlo		! real destination is $C1
	xor	$C3,$C2,$C2
	xor	$Xlo,$C1,$C1
	xor	$Xhi,$C2,$C2
	xor	$Xhi,$C1,$C1

	xmulxhi	$C0,$xE1,$Xlo		! ·0xE1<<1<<56
	xor	$C0,$C2,$C2
	xmulx	$C1,$xE1,$C0
	xor	$C1,$C3,$C3
	xmulxhi	$C1,$xE1,$C1

	xor	$Xlo,$C2,$C2
	xor	$C0,$C2,$C2
	xor	$C1,$C3,$C3

	stx	$C2,[$Xip+8]		! save Xi
	stx	$C3,[$Xip+0]

	ret
	restore
.type	gcm_gmult_vis3,#function
.size	gcm_gmult_vis3,.-gcm_gmult_vis3

.globl	gcm_ghash_vis3
.align	32
gcm_ghash_vis3:
	save	%sp,-$frame,%sp
	nop
	srln	$len,0,$len		! needed on v8+, "nop" on v9

	ldx	[$Xip+8],$C2		! load Xi
	ldx	[$Xip+0],$C3
	ldx	[$Htable+8],$Hlo	! load twisted H
	ldx	[$Htable+0],$Hhi

	mov	0xE1,%l7
	sllx	%l7,57,$xE1		! 57 is not a typo
	ldx	[$Htable+16],$V		! (0xE0·i)&0xff=0xA040608020C0E000

	and	$inp,7,$shl
	andn	$inp,7,$inp
	sll	$shl,3,$shl
	prefetch [$inp+63], 20
	sub	%g0,$shl,$shr

	xor	$Hhi,$Hlo,$Hhl		! Karatsuba pre-processing
.Loop:
	ldx	[$inp+8],$Xlo
	brz,pt	$shl,1f
	ldx	[$inp+0],$Xhi

	ldx	[$inp+16],$C1		! align data
	srlx	$Xlo,$shr,$C0
	sllx	$Xlo,$shl,$Xlo
	sllx	$Xhi,$shl,$Xhi
	srlx	$C1,$shr,$C1
	or	$C0,$Xhi,$Xhi
	or	$C1,$Xlo,$Xlo
1:
	add	$inp,16,$inp
	sub	$len,16,$len
	xor	$C2,$Xlo,$Xlo
	xor	$C3,$Xhi,$Xhi
	prefetch [$inp+63], 20

	xmulx	$Xlo,$Hlo,$C0
	xor	$Xlo,$Xhi,$C2		! Karatsuba pre-processing
	xmulx	$C2,$Hhl,$C1
	xmulxhi	$Xlo,$Hlo,$Xlo
	xmulxhi	$C2,$Hhl,$C2
	xmulxhi	$Xhi,$Hhi,$C3
	xmulx	$Xhi,$Hhi,$Xhi

	sll	$C0,3,$sqr
	srlx	$V,$sqr,$sqr		! ·0xE0 [implicit &(7<<3)]
	xor	$C0,$sqr,$sqr
	sllx	$sqr,57,$sqr		! ($C0·0xE1)<<1<<56 [implicit &0x7f]

	xor	$C0,$C1,$C1		! Karatsuba post-processing
	xor	$Xlo,$C2,$C2
	xor	$sqr,$Xlo,$Xlo		! real destination is $C1
	xor	$C3,$C2,$C2
	xor	$Xlo,$C1,$C1
	xor	$Xhi,$C2,$C2
	xor	$Xhi,$C1,$C1

	xmulxhi	$C0,$xE1,$Xlo		! ·0xE1<<1<<56
	xor	$C0,$C2,$C2
	xmulx	$C1,$xE1,$C0
	xor	$C1,$C3,$C3
	xmulxhi	$C1,$xE1,$C1

	xor	$Xlo,$C2,$C2
	xor	$C0,$C2,$C2
	brnz,pt	$len,.Loop
	xor	$C1,$C3,$C3

	stx	$C2,[$Xip+8]		! save Xi
	stx	$C3,[$Xip+0]

	ret
	restore
.type	gcm_ghash_vis3,#function
.size	gcm_ghash_vis3,.-gcm_ghash_vis3
___
}}}
# Embed an identification string in the generated object file.
$code.=<<___;
.asciz	"GHASH for SPARCv9/VIS3, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___
538
539 \f
# Purpose of this subroutine is to explicitly encode VIS instructions,
# so that one can compile the module without having to specify VIS
# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# Idea is to reserve for option to produce "universal" binary and let
# programmer detect if current CPU is VIS capable at run-time.
# unvis3($mnemonic,$rs1,$rs2,$rd) -- hand-encode a VIS3 three-register
# instruction as a raw ".word" directive (with the textual form appended
# as an assembler comment), so no VIS-aware assembler is required.
# Unknown mnemonics or unparsable operands fall through to plain text.
sub unvis3 {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
# Offset of each register file within the 5-bit register number space.
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
# opf field values of the VIS3 instructions this module emits.
my %visopf = ( "addxc"   => 0x011,
	       "addxccc" => 0x013,
	       "xmulx"   => 0x115,
	       "xmulxhi" => 0x116 );

my $asm = "$mnemonic\t$rs1,$rs2,$rd";
my $opf = $visopf{$mnemonic};
return $asm unless $opf;

# Translate each %gN/%oN/%lN/%iN operand to its 5-bit encoding.
my @num;
foreach my $reg ($rs1,$rs2,$rd) {
	return $asm unless $reg =~ /%([goli])([0-9])/;
	push @num, $bias{$1}+$2;
}
my ($n1,$n2,$nd) = @num;

# IMPL format: op=2, rd, op3=0x36 (0x81b00000), rs1, opf, rs2.
return sprintf ".word\t0x%08x !%s",
	       0x81b00000|$nd<<25|$n1<<14|$opf<<5|$n2,
	       $asm;
}
569
# Post-process and print the accumulated assembly one line at a time:
# constant-fold `...` expressions, then replace VIS3 mnemonics with raw
# .word encodings via unvis3() so a non-VIS assembler can consume it.
foreach my $line (split("\n",$code)) {
	$line =~ s/\`([^\`]*)\`/eval $1/ge;

	$line =~ s/\b(xmulx[hi]*|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
		&unvis3($1,$2,$3,$4)
	 /ge;

	print $line,"\n";
}

close STDOUT or die "error closing STDOUT: $!";