]> git.ipfire.org Git - thirdparty/openssl.git/blob - crypto/poly1305/asm/poly1305-sparcv9.pl
Following the license change, modify the boilerplates in crypto/poly1305/
[thirdparty/openssl.git] / crypto / poly1305 / asm / poly1305-sparcv9.pl
1 #! /usr/bin/env perl
2 # Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9 #
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16 #
17 # This module implements Poly1305 hash for SPARCv9, vanilla, as well
18 # as VIS3 and FMA extensions.
19 #
20 # May, August 2015
21 #
22 # Numbers are cycles per processed byte with poly1305_blocks alone.
23 #
24 # IALU(*) FMA
25 #
26 # UltraSPARC III 12.3(**)
27 # SPARC T3 7.92
28 # SPARC T4 1.70(***) 6.55
29 # SPARC64 X 5.60 3.64
30 #
31 # (*) Comparison to compiler-generated code is really problematic,
32 # because latter's performance varies too much depending on too
33 # many variables. For example, one can measure from 5x to 15x
34 # improvement on T4 for gcc-4.6. Well, in T4 case it's a bit
35 # unfair comparison, because compiler doesn't use VIS3, but
36 # given same initial conditions coefficient varies from 3x to 9x.
37 # (**) Pre-III performance should be even worse; floating-point
38 # performance for UltraSPARC I-IV on the other hand is reported
39 # to be 4.25 for hand-coded assembly, but they are just too old
40 # to care about.
41 # (***) Multi-process benchmark saturates at ~12.5x single-process
42 # result on 8-core processor, or ~21GBps per 2.85GHz socket.
43
# The last command-line argument names the file that receives the generated
# assembly.  When no argument is given the code goes to the inherited STDOUT.
# (The original repeated this step a second time, popping one more argument
# and trying to open the never-defined $stdout; that duplicate is removed.)
my $output = pop;
if (defined $output) {
    open STDOUT, '>', $output or die "can't open $output: $!";
}

# Symbolic names for the integer registers used by the integer and VIS3
# code paths; the values are interpolated into the assembly heredocs.
my ($ctx,$inp,$len,$padbit,$shl,$shr)	= map("%i$_",(0..5));	# %i0-%i5
my ($r0,$r1,$r2,$r3,$s1,$s2,$s3,$h4)	= map("%l$_",(0..7));	# %l0-%l7
my ($h0,$h1,$h2,$h3, $t0,$t1,$t2)	= map("%o$_",(0..5,7));	# %o0-%o5,%o7 (%o6 is skipped)
my ($d0,$d1,$d2,$d3)			= map("%g$_",(1..4));	# %g1-%g4
54
# Append the integer-only code path to $code: poly1305_init and the base
# poly1305_blocks routine.
#
# poly1305_init loads OPENSSL_sparcv9cap_P and branches to
# .Lpoly1305_init_fma when, of the FMADD and VIS3 bits, only FMADD is set.
# Otherwise it zeroes the hash, loads the key little-endian (handling a
# misaligned pointer by shifting), masks it and stores it at $ctx+32.  If
# VIS3 is available it also stores the addresses of poly1305_blocks_vis3
# and poly1305_emit through the pointer in %i2 and returns 1; in all other
# cases on this path it returns 0.
#
# poly1305_blocks processes $len/16 blocks using 32x32-bit umul and
# add-with-carry chains.  Everything between the heredoc markers is emitted
# assembly text, so '!' comments there are assembler comments.
55 $code.=<<___;
56 #include "sparc_arch.h"
57
58 #ifdef __arch64__
59 .register %g2,#scratch
60 .register %g3,#scratch
61 # define STPTR stx
62 # define SIZE_T 8
63 #else
64 # define STPTR st
65 # define SIZE_T 4
66 #endif
67 #define LOCALS (STACK_BIAS+STACK_FRAME)
68
69 .section ".text",#alloc,#execinstr
70
71 #ifdef __PIC__
72 SPARC_PIC_THUNK(%g1)
73 #endif
74
75 .globl poly1305_init
76 .align 32
77 poly1305_init:
78 save %sp,-STACK_FRAME-16,%sp
79 nop
80
81 SPARC_LOAD_ADDRESS(OPENSSL_sparcv9cap_P,%g1)
82 ld [%g1],%g1
83
84 and %g1,SPARCV9_FMADD|SPARCV9_VIS3,%g1
85 cmp %g1,SPARCV9_FMADD
86 be .Lpoly1305_init_fma
87 nop
88
89 stx %g0,[$ctx+0]
90 stx %g0,[$ctx+8] ! zero hash value
91 brz,pn $inp,.Lno_key
92 stx %g0,[$ctx+16]
93
94 and $inp,7,$shr ! alignment factor
95 andn $inp,7,$inp
96 sll $shr,3,$shr ! *8
97 neg $shr,$shl
98
99 sethi %hi(0x0ffffffc),$t0
100 set 8,$h1
101 or $t0,%lo(0x0ffffffc),$t0
102 set 16,$h2
103 sllx $t0,32,$t1
104 or $t0,$t1,$t1 ! 0x0ffffffc0ffffffc
105 or $t1,3,$t0 ! 0x0ffffffc0fffffff
106
107 ldxa [$inp+%g0]0x88,$h0 ! load little-endian key
108 brz,pt $shr,.Lkey_aligned
109 ldxa [$inp+$h1]0x88,$h1
110
111 ldxa [$inp+$h2]0x88,$h2
112 srlx $h0,$shr,$h0
113 sllx $h1,$shl,$t2
114 srlx $h1,$shr,$h1
115 or $t2,$h0,$h0
116 sllx $h2,$shl,$h2
117 or $h2,$h1,$h1
118
119 .Lkey_aligned:
120 and $t0,$h0,$h0
121 and $t1,$h1,$h1
122 stx $h0,[$ctx+32+0] ! store key
123 stx $h1,[$ctx+32+8]
124
125 andcc %g1,SPARCV9_VIS3,%g0
126 be .Lno_key
127 nop
128
129 1: call .+8
130 add %o7,poly1305_blocks_vis3-1b,%o7
131
132 add %o7,poly1305_emit-poly1305_blocks_vis3,%o5
133 STPTR %o7,[%i2]
134 STPTR %o5,[%i2+SIZE_T]
135
136 ret
137 restore %g0,1,%o0 ! return 1
138
139 .Lno_key:
140 ret
141 restore %g0,%g0,%o0 ! return 0
142 .type poly1305_init,#function
143 .size poly1305_init,.-poly1305_init
144
145 .globl poly1305_blocks
146 .align 32
147 poly1305_blocks:
148 save %sp,-STACK_FRAME,%sp
149 srln $len,4,$len
150
151 brz,pn $len,.Lno_data
152 nop
153
154 ld [$ctx+32+0],$r1 ! load key
155 ld [$ctx+32+4],$r0
156 ld [$ctx+32+8],$r3
157 ld [$ctx+32+12],$r2
158
159 ld [$ctx+0],$h1 ! load hash value
160 ld [$ctx+4],$h0
161 ld [$ctx+8],$h3
162 ld [$ctx+12],$h2
163 ld [$ctx+16],$h4
164
165 and $inp,7,$shr ! alignment factor
166 andn $inp,7,$inp
167 set 8,$d1
168 sll $shr,3,$shr ! *8
169 set 16,$d2
170 neg $shr,$shl
171
172 srl $r1,2,$s1
173 srl $r2,2,$s2
174 add $r1,$s1,$s1
175 srl $r3,2,$s3
176 add $r2,$s2,$s2
177 add $r3,$s3,$s3
178
179 .Loop:
180 ldxa [$inp+%g0]0x88,$d0 ! load little-endian input
181 brz,pt $shr,.Linp_aligned
182 ldxa [$inp+$d1]0x88,$d1
183
184 ldxa [$inp+$d2]0x88,$d2
185 srlx $d0,$shr,$d0
186 sllx $d1,$shl,$t1
187 srlx $d1,$shr,$d1
188 or $t1,$d0,$d0
189 sllx $d2,$shl,$d2
190 or $d2,$d1,$d1
191
192 .Linp_aligned:
193 srlx $d0,32,$t0
194 addcc $d0,$h0,$h0 ! accumulate input
195 srlx $d1,32,$t1
196 addccc $t0,$h1,$h1
197 addccc $d1,$h2,$h2
198 addccc $t1,$h3,$h3
199 addc $padbit,$h4,$h4
200
201 umul $r0,$h0,$d0
202 umul $r1,$h0,$d1
203 umul $r2,$h0,$d2
204 umul $r3,$h0,$d3
205 sub $len,1,$len
206 add $inp,16,$inp
207
208 umul $s3,$h1,$t0
209 umul $r0,$h1,$t1
210 umul $r1,$h1,$t2
211 add $t0,$d0,$d0
212 add $t1,$d1,$d1
213 umul $r2,$h1,$t0
214 add $t2,$d2,$d2
215 add $t0,$d3,$d3
216
217 umul $s2,$h2,$t1
218 umul $s3,$h2,$t2
219 umul $r0,$h2,$t0
220 add $t1,$d0,$d0
221 add $t2,$d1,$d1
222 umul $r1,$h2,$t1
223 add $t0,$d2,$d2
224 add $t1,$d3,$d3
225
226 umul $s1,$h3,$t2
227 umul $s2,$h3,$t0
228 umul $s3,$h3,$t1
229 add $t2,$d0,$d0
230 add $t0,$d1,$d1
231 umul $r0,$h3,$t2
232 add $t1,$d2,$d2
233 add $t2,$d3,$d3
234
235 umul $s1,$h4,$t0
236 umul $s2,$h4,$t1
237 umul $s3,$h4,$t2
238 umul $r0,$h4,$h4
239 add $t0,$d1,$d1
240 add $t1,$d2,$d2
241 srlx $d0,32,$h1
242 add $t2,$d3,$d3
243 srlx $d1,32,$h2
244
245 addcc $d1,$h1,$h1
246 srlx $d2,32,$h3
247 set 8,$d1
248 addccc $d2,$h2,$h2
249 srlx $d3,32,$t0
250 set 16,$d2
251 addccc $d3,$h3,$h3
252 addc $t0,$h4,$h4
253
254 srl $h4,2,$t0 ! final reduction step
255 andn $h4,3,$t1
256 and $h4,3,$h4
257 add $t1,$t0,$t0
258
259 addcc $t0,$d0,$h0
260 addccc %g0,$h1,$h1
261 addccc %g0,$h2,$h2
262 addccc %g0,$h3,$h3
263 brnz,pt $len,.Loop
264 addc %g0,$h4,$h4
265
266 st $h1,[$ctx+0] ! store hash value
267 st $h0,[$ctx+4]
268 st $h3,[$ctx+8]
269 st $h2,[$ctx+12]
270 st $h4,[$ctx+16]
271
272 .Lno_data:
273 ret
274 restore
275 .type poly1305_blocks,#function
276 .size poly1305_blocks,.-poly1305_blocks
277 ___
278 ########################################################################
279 # VIS3 has umulxhi and addxc...
# Scope for the VIS3 variant of the block function.  It keeps the hash in
# two 64-bit limbs plus a small top limb ($H0,$H1,$H2) and the key in
# $R0,$R1, multiplying with mulx/umulxhi and carrying with addxc/addxccc.
# The upper-case names below are local to this scope:
#   $H0,$H1,$H2,$R0,$R1,$S1 = %o0-%o5, $T1 = %o7, $D0-$D2 = %g1-%g3, $T0 = %g4.
# The outer-scope $r1/$r2 (%l1/%l2) are reused here only as the constant
# offsets 8 and 16 for the input loads, and the early-exit branch targets
# the .Lno_data label that is emitted as part of poly1305_blocks.
280 {
281 my ($H0,$H1,$H2,$R0,$R1,$S1,$T1) = map("%o$_",(0..5,7));
282 my ($D0,$D1,$D2,$T0) = map("%g$_",(1..4));
283
284 $code.=<<___;
285 .align 32
286 poly1305_blocks_vis3:
287 save %sp,-STACK_FRAME,%sp
288 srln $len,4,$len
289
290 brz,pn $len,.Lno_data
291 nop
292
293 ldx [$ctx+32+0],$R0 ! load key
294 ldx [$ctx+32+8],$R1
295
296 ldx [$ctx+0],$H0 ! load hash value
297 ldx [$ctx+8],$H1
298 ld [$ctx+16],$H2
299
300 and $inp,7,$shr ! alignment factor
301 andn $inp,7,$inp
302 set 8,$r1
303 sll $shr,3,$shr ! *8
304 set 16,$r2
305 neg $shr,$shl
306
307 srlx $R1,2,$S1
308 b .Loop_vis3
309 add $R1,$S1,$S1
310
311 .Loop_vis3:
312 ldxa [$inp+%g0]0x88,$D0 ! load little-endian input
313 brz,pt $shr,.Linp_aligned_vis3
314 ldxa [$inp+$r1]0x88,$D1
315
316 ldxa [$inp+$r2]0x88,$D2
317 srlx $D0,$shr,$D0
318 sllx $D1,$shl,$T1
319 srlx $D1,$shr,$D1
320 or $T1,$D0,$D0
321 sllx $D2,$shl,$D2
322 or $D2,$D1,$D1
323
324 .Linp_aligned_vis3:
325 addcc $D0,$H0,$H0 ! accumulate input
326 sub $len,1,$len
327 addxccc $D1,$H1,$H1
328 add $inp,16,$inp
329
330 mulx $R0,$H0,$D0 ! r0*h0
331 addxc $padbit,$H2,$H2
332 umulxhi $R0,$H0,$D1
333 mulx $S1,$H1,$T0 ! s1*h1
334 umulxhi $S1,$H1,$T1
335 addcc $T0,$D0,$D0
336 mulx $R1,$H0,$T0 ! r1*h0
337 addxc $T1,$D1,$D1
338 umulxhi $R1,$H0,$D2
339 addcc $T0,$D1,$D1
340 mulx $R0,$H1,$T0 ! r0*h1
341 addxc %g0,$D2,$D2
342 umulxhi $R0,$H1,$T1
343 addcc $T0,$D1,$D1
344 mulx $S1,$H2,$T0 ! s1*h2
345 addxc $T1,$D2,$D2
346 mulx $R0,$H2,$T1 ! r0*h2
347 addcc $T0,$D1,$D1
348 addxc $T1,$D2,$D2
349
350 srlx $D2,2,$T0 ! final reduction step
351 andn $D2,3,$T1
352 and $D2,3,$H2
353 add $T1,$T0,$T0
354
355 addcc $T0,$D0,$H0
356 addxccc %g0,$D1,$H1
357 brnz,pt $len,.Loop_vis3
358 addxc %g0,$H2,$H2
359
360 stx $H0,[$ctx+0] ! store hash value
361 stx $H1,[$ctx+8]
362 st $H2,[$ctx+16]
363
364 ret
365 restore
366 .type poly1305_blocks_vis3,#function
367 .size poly1305_blocks_vis3,.-poly1305_blocks_vis3
368 ___
369 }
# poly1305_emit takes ($ctx, $mac, $nonce) in %i0-%i2; the second and third
# incoming registers are simply renamed here for readability.
# The routine loads the hash as five 32-bit words, computes hash+5 with a
# carry chain and keeps that value when bit 2 of the top word is set after
# the addition, adds the four nonce words, and stores the 16-byte result one
# byte at a time in little-endian order.
370 my ($mac,$nonce) = ($inp,$len);
371
372 $code.=<<___;
373 .globl poly1305_emit
374 .align 32
375 poly1305_emit:
376 save %sp,-STACK_FRAME,%sp
377
378 ld [$ctx+0],$h1 ! load hash value
379 ld [$ctx+4],$h0
380 ld [$ctx+8],$h3
381 ld [$ctx+12],$h2
382 ld [$ctx+16],$h4
383
384 addcc $h0,5,$r0 ! compare to modulus
385 addccc $h1,0,$r1
386 addccc $h2,0,$r2
387 addccc $h3,0,$r3
388 addc $h4,0,$h4
389 andcc $h4,4,%g0 ! did it carry/borrow?
390
391 movnz %icc,$r0,$h0
392 ld [$nonce+0],$r0 ! load nonce
393 movnz %icc,$r1,$h1
394 ld [$nonce+4],$r1
395 movnz %icc,$r2,$h2
396 ld [$nonce+8],$r2
397 movnz %icc,$r3,$h3
398 ld [$nonce+12],$r3
399
400 addcc $r0,$h0,$h0 ! accumulate nonce
401 addccc $r1,$h1,$h1
402 addccc $r2,$h2,$h2
403 addc $r3,$h3,$h3
404
405 srl $h0,8,$r0
406 stb $h0,[$mac+0] ! store little-endian result
407 srl $h0,16,$r1
408 stb $r0,[$mac+1]
409 srl $h0,24,$r2
410 stb $r1,[$mac+2]
411 stb $r2,[$mac+3]
412
413 srl $h1,8,$r0
414 stb $h1,[$mac+4]
415 srl $h1,16,$r1
416 stb $r0,[$mac+5]
417 srl $h1,24,$r2
418 stb $r1,[$mac+6]
419 stb $r2,[$mac+7]
420
421 srl $h2,8,$r0
422 stb $h2,[$mac+8]
423 srl $h2,16,$r1
424 stb $r0,[$mac+9]
425 srl $h2,24,$r2
426 stb $r1,[$mac+10]
427 stb $r2,[$mac+11]
428
429 srl $h3,8,$r0
430 stb $h3,[$mac+12]
431 srl $h3,16,$r1
432 stb $r0,[$mac+13]
433 srl $h3,24,$r2
434 stb $r1,[$mac+14]
435 stb $r2,[$mac+15]
436
437 ret
438 restore
439 .type poly1305_emit,#function
440 .size poly1305_emit,.-poly1305_emit
441 ___
442
443 {
# Register names for the FMA code path; these shadow the outer-scope names
# for the rest of the enclosing block.
# Arguments arrive in %i0-%i3; $in0-$in4 (%o0-%o4) hold raw input words.
444 my ($ctx,$inp,$len,$padbit) = map("%i$_",(0..3));
445 my ($in0,$in1,$in2,$in3,$in4) = map("%o$_",(0..4));
# Only the first four of the eight generated names are consumed, so
# $i1=%l0, $step=%l1, $shr=%l2, $shl=%l3.  $i2 is a second name for the
# same register as $step.
446 my ($i1,$step,$shr,$shl) = map("%l$_",(0..7));
447 my $i2=$step;
448
# "%f".2*$_ concatenates "%f" with 2*$_, giving the 32 even-numbered
# double-precision register names %f0, %f2, ... %f62 in the order listed.
449 my ($h0lo,$h0hi,$h1lo,$h1hi,$h2lo,$h2hi,$h3lo,$h3hi,
450 $two0,$two32,$two64,$two96,$two130,$five_two130,
451 $r0lo,$r0hi,$r1lo,$r1hi,$r2lo,$r2hi,
452 $s2lo,$s2hi,$s3lo,$s3hi,
453 $c0lo,$c0hi,$c1lo,$c1hi,$c2lo,$c2hi,$c3lo,$c3hi) = map("%f".2*$_,(0..31));
# The names below are additional aliases for the $c* registers above, so
# $r3*/$s1*, $x* and $y* occupy the same physical registers as the carries.
# Note that $y2/$y3 map to $c3hi/$c3lo, i.e. in swapped order relative to
# $x2/$x3.
454 # borrowings
455 my ($r3lo,$r3hi,$s1lo,$s1hi) = ($c0lo,$c0hi,$c1lo,$c1hi);
456 my ($x0,$x1,$x2,$x3) = ($c2lo,$c2hi,$c3lo,$c3hi);
457 my ($y0,$y1,$y2,$y3) = ($c1lo,$c1hi,$c3hi,$c3lo);
458
# Append the FMA code path to $code: poly1305_init_fma and
# poly1305_blocks_fma.
#
# .Lpoly1305_init_fma is the label poly1305_init branches to; the code there
# locates .Lconsts_fma PC-relatively, stores the biased constants as the
# initial hash value and, if a key pointer was given, builds the key as
# doubles by writing its 32-bit pieces into the low words of those constants
# ("template") and subtracting the bias.  It then splits each value into
# hi/lo parts, stores r*/s* at $ctx+8*4 .. $ctx+8*17, writes the addresses
# of poly1305_blocks_fma and poly1305_emit_fma through %i2 and returns 1.
# %fsr is saved on the stack, replaced by the word pair at .Lconsts_fma+8*6
# for the duration of the floating-point work, and restored afterwards; the
# block function does the same.
#
# The `8*4+4`-style expressions in backticks are not assembly: they are
# evaluated by the post-processing loop at the end of this script.
459 $code.=<<___;
460 .align 32
461 poly1305_init_fma:
462 save %sp,-STACK_FRAME-16,%sp
463 nop
464
465 .Lpoly1305_init_fma:
466 1: call .+8
467 add %o7,.Lconsts_fma-1b,%o7
468
469 ldd [%o7+8*0],$two0 ! load constants
470 ldd [%o7+8*1],$two32
471 ldd [%o7+8*2],$two64
472 ldd [%o7+8*3],$two96
473 ldd [%o7+8*5],$five_two130
474
475 std $two0,[$ctx+8*0] ! initial hash value, biased 0
476 std $two32,[$ctx+8*1]
477 std $two64,[$ctx+8*2]
478 std $two96,[$ctx+8*3]
479
480 brz,pn $inp,.Lno_key_fma
481 nop
482
483 stx %fsr,[%sp+LOCALS] ! save original %fsr
484 ldx [%o7+8*6],%fsr ! load new %fsr
485
486 std $two0,[$ctx+8*4] ! key "template"
487 std $two32,[$ctx+8*5]
488 std $two64,[$ctx+8*6]
489 std $two96,[$ctx+8*7]
490
491 and $inp,7,$shr
492 andn $inp,7,$inp ! align pointer
493 mov 8,$i1
494 sll $shr,3,$shr
495 mov 16,$i2
496 neg $shr,$shl
497
498 ldxa [$inp+%g0]0x88,$in0 ! load little-endian key
499 ldxa [$inp+$i1]0x88,$in2
500
501 brz $shr,.Lkey_aligned_fma
502 sethi %hi(0xf0000000),$i1 ! 0xf0000000
503
504 ldxa [$inp+$i2]0x88,$in4
505
506 srlx $in0,$shr,$in0 ! align data
507 sllx $in2,$shl,$in1
508 srlx $in2,$shr,$in2
509 or $in1,$in0,$in0
510 sllx $in4,$shl,$in3
511 or $in3,$in2,$in2
512
513 .Lkey_aligned_fma:
514 or $i1,3,$i2 ! 0xf0000003
515 srlx $in0,32,$in1
516 andn $in0,$i1,$in0 ! &=0x0fffffff
517 andn $in1,$i2,$in1 ! &=0x0ffffffc
518 srlx $in2,32,$in3
519 andn $in2,$i2,$in2
520 andn $in3,$i2,$in3
521
522 st $in0,[$ctx+`8*4+4`] ! fill "template"
523 st $in1,[$ctx+`8*5+4`]
524 st $in2,[$ctx+`8*6+4`]
525 st $in3,[$ctx+`8*7+4`]
526
527 ldd [$ctx+8*4],$h0lo ! load [biased] key
528 ldd [$ctx+8*5],$h1lo
529 ldd [$ctx+8*6],$h2lo
530 ldd [$ctx+8*7],$h3lo
531
532 fsubd $h0lo,$two0, $h0lo ! r0
533 ldd [%o7+8*7],$two0 ! more constants
534 fsubd $h1lo,$two32,$h1lo ! r1
535 ldd [%o7+8*8],$two32
536 fsubd $h2lo,$two64,$h2lo ! r2
537 ldd [%o7+8*9],$two64
538 fsubd $h3lo,$two96,$h3lo ! r3
539 ldd [%o7+8*10],$two96
540
541 fmuld $five_two130,$h1lo,$s1lo ! s1
542 fmuld $five_two130,$h2lo,$s2lo ! s2
543 fmuld $five_two130,$h3lo,$s3lo ! s3
544
545 faddd $h0lo,$two0, $h0hi
546 faddd $h1lo,$two32,$h1hi
547 faddd $h2lo,$two64,$h2hi
548 faddd $h3lo,$two96,$h3hi
549
550 fsubd $h0hi,$two0, $h0hi
551 ldd [%o7+8*11],$two0 ! more constants
552 fsubd $h1hi,$two32,$h1hi
553 ldd [%o7+8*12],$two32
554 fsubd $h2hi,$two64,$h2hi
555 ldd [%o7+8*13],$two64
556 fsubd $h3hi,$two96,$h3hi
557
558 fsubd $h0lo,$h0hi,$h0lo
559 std $h0hi,[$ctx+8*5] ! r0hi
560 fsubd $h1lo,$h1hi,$h1lo
561 std $h1hi,[$ctx+8*7] ! r1hi
562 fsubd $h2lo,$h2hi,$h2lo
563 std $h2hi,[$ctx+8*9] ! r2hi
564 fsubd $h3lo,$h3hi,$h3lo
565 std $h3hi,[$ctx+8*11] ! r3hi
566
567 faddd $s1lo,$two0, $s1hi
568 faddd $s2lo,$two32,$s2hi
569 faddd $s3lo,$two64,$s3hi
570
571 fsubd $s1hi,$two0, $s1hi
572 fsubd $s2hi,$two32,$s2hi
573 fsubd $s3hi,$two64,$s3hi
574
575 fsubd $s1lo,$s1hi,$s1lo
576 fsubd $s2lo,$s2hi,$s2lo
577 fsubd $s3lo,$s3hi,$s3lo
578
579 ldx [%sp+LOCALS],%fsr ! restore %fsr
580
581 std $h0lo,[$ctx+8*4] ! r0lo
582 std $h1lo,[$ctx+8*6] ! r1lo
583 std $h2lo,[$ctx+8*8] ! r2lo
584 std $h3lo,[$ctx+8*10] ! r3lo
585
586 std $s1hi,[$ctx+8*13]
587 std $s2hi,[$ctx+8*15]
588 std $s3hi,[$ctx+8*17]
589
590 std $s1lo,[$ctx+8*12]
591 std $s2lo,[$ctx+8*14]
592 std $s3lo,[$ctx+8*16]
593
594 add %o7,poly1305_blocks_fma-.Lconsts_fma,%o0
595 add %o7,poly1305_emit_fma-.Lconsts_fma,%o1
596 STPTR %o0,[%i2]
597 STPTR %o1,[%i2+SIZE_T]
598
599 ret
600 restore %g0,1,%o0 ! return 1
601
602 .Lno_key_fma:
603 ret
604 restore %g0,%g0,%o0 ! return 0
605 .type poly1305_init_fma,#function
606 .size poly1305_init_fma,.-poly1305_init_fma
607
608 .align 32
609 poly1305_blocks_fma:
610 save %sp,-STACK_FRAME-48,%sp
611 srln $len,4,$len
612
613 brz,pn $len,.Labort
614 sub $len,1,$len
615
616 1: call .+8
617 add %o7,.Lconsts_fma-1b,%o7
618
619 ldd [%o7+8*0],$two0 ! load constants
620 ldd [%o7+8*1],$two32
621 ldd [%o7+8*2],$two64
622 ldd [%o7+8*3],$two96
623 ldd [%o7+8*4],$two130
624 ldd [%o7+8*5],$five_two130
625
626 ldd [$ctx+8*0],$h0lo ! load [biased] hash value
627 ldd [$ctx+8*1],$h1lo
628 ldd [$ctx+8*2],$h2lo
629 ldd [$ctx+8*3],$h3lo
630
631 std $two0,[%sp+LOCALS+8*0] ! input "template"
632 sethi %hi((1023+52+96)<<20),$in3
633 std $two32,[%sp+LOCALS+8*1]
634 or $padbit,$in3,$in3
635 std $two64,[%sp+LOCALS+8*2]
636 st $in3,[%sp+LOCALS+8*3]
637
638 and $inp,7,$shr
639 andn $inp,7,$inp ! align pointer
640 mov 8,$i1
641 sll $shr,3,$shr
642 mov 16,$step
643 neg $shr,$shl
644
645 ldxa [$inp+%g0]0x88,$in0 ! load little-endian input
646 brz $shr,.Linp_aligned_fma
647 ldxa [$inp+$i1]0x88,$in2
648
649 ldxa [$inp+$step]0x88,$in4
650 add $inp,8,$inp
651
652 srlx $in0,$shr,$in0 ! align data
653 sllx $in2,$shl,$in1
654 srlx $in2,$shr,$in2
655 or $in1,$in0,$in0
656 sllx $in4,$shl,$in3
657 srlx $in4,$shr,$in4 ! pre-shift
658 or $in3,$in2,$in2
659
660 .Linp_aligned_fma:
661 srlx $in0,32,$in1
662 movrz $len,0,$step
663 srlx $in2,32,$in3
664 add $step,$inp,$inp ! conditional advance
665
666 st $in0,[%sp+LOCALS+8*0+4] ! fill "template"
667 st $in1,[%sp+LOCALS+8*1+4]
668 st $in2,[%sp+LOCALS+8*2+4]
669 st $in3,[%sp+LOCALS+8*3+4]
670
671 ldd [$ctx+8*4],$r0lo ! load key
672 ldd [$ctx+8*5],$r0hi
673 ldd [$ctx+8*6],$r1lo
674 ldd [$ctx+8*7],$r1hi
675 ldd [$ctx+8*8],$r2lo
676 ldd [$ctx+8*9],$r2hi
677 ldd [$ctx+8*10],$r3lo
678 ldd [$ctx+8*11],$r3hi
679 ldd [$ctx+8*12],$s1lo
680 ldd [$ctx+8*13],$s1hi
681 ldd [$ctx+8*14],$s2lo
682 ldd [$ctx+8*15],$s2hi
683 ldd [$ctx+8*16],$s3lo
684 ldd [$ctx+8*17],$s3hi
685
686 stx %fsr,[%sp+LOCALS+8*4] ! save original %fsr
687 ldx [%o7+8*6],%fsr ! load new %fsr
688
689 subcc $len,1,$len
690 movrz $len,0,$step
691
692 ldd [%sp+LOCALS+8*0],$x0 ! load biased input
693 ldd [%sp+LOCALS+8*1],$x1
694 ldd [%sp+LOCALS+8*2],$x2
695 ldd [%sp+LOCALS+8*3],$x3
696
697 fsubd $h0lo,$two0, $h0lo ! de-bias hash value
698 fsubd $h1lo,$two32,$h1lo
699 ldxa [$inp+%g0]0x88,$in0 ! modulo-scheduled input load
700 fsubd $h2lo,$two64,$h2lo
701 fsubd $h3lo,$two96,$h3lo
702 ldxa [$inp+$i1]0x88,$in2
703
704 fsubd $x0,$two0, $x0 ! de-bias input
705 fsubd $x1,$two32,$x1
706 fsubd $x2,$two64,$x2
707 fsubd $x3,$two96,$x3
708
709 brz $shr,.Linp_aligned_fma2
710 add $step,$inp,$inp ! conditional advance
711
712 sllx $in0,$shl,$in1 ! align data
713 srlx $in0,$shr,$in3
714 or $in1,$in4,$in0
715 sllx $in2,$shl,$in1
716 srlx $in2,$shr,$in4 ! pre-shift
717 or $in3,$in1,$in2
718 .Linp_aligned_fma2:
719 srlx $in0,32,$in1
720 srlx $in2,32,$in3
721
722 faddd $h0lo,$x0,$x0 ! accumulate input
723 stw $in0,[%sp+LOCALS+8*0+4]
724 faddd $h1lo,$x1,$x1
725 stw $in1,[%sp+LOCALS+8*1+4]
726 faddd $h2lo,$x2,$x2
727 stw $in2,[%sp+LOCALS+8*2+4]
728 faddd $h3lo,$x3,$x3
729 stw $in3,[%sp+LOCALS+8*3+4]
730
731 b .Lentry_fma
732 nop
733
734 .align 16
735 .Loop_fma:
736 ldxa [$inp+%g0]0x88,$in0 ! modulo-scheduled input load
737 ldxa [$inp+$i1]0x88,$in2
738 movrz $len,0,$step
739
740 faddd $y0,$h0lo,$h0lo ! accumulate input
741 faddd $y1,$h0hi,$h0hi
742 faddd $y2,$h2lo,$h2lo
743 faddd $y3,$h2hi,$h2hi
744
745 brz,pn $shr,.Linp_aligned_fma3
746 add $step,$inp,$inp ! conditional advance
747
748 sllx $in0,$shl,$in1 ! align data
749 srlx $in0,$shr,$in3
750 or $in1,$in4,$in0
751 sllx $in2,$shl,$in1
752 srlx $in2,$shr,$in4 ! pre-shift
753 or $in3,$in1,$in2
754
755 .Linp_aligned_fma3:
756 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! base 2^48 -> base 2^32
757 faddd $two64,$h1lo,$c1lo
758 srlx $in0,32,$in1
759 faddd $two64,$h1hi,$c1hi
760 srlx $in2,32,$in3
761 faddd $two130,$h3lo,$c3lo
762 st $in0,[%sp+LOCALS+8*0+4] ! fill "template"
763 faddd $two130,$h3hi,$c3hi
764 st $in1,[%sp+LOCALS+8*1+4]
765 faddd $two32,$h0lo,$c0lo
766 st $in2,[%sp+LOCALS+8*2+4]
767 faddd $two32,$h0hi,$c0hi
768 st $in3,[%sp+LOCALS+8*3+4]
769 faddd $two96,$h2lo,$c2lo
770 faddd $two96,$h2hi,$c2hi
771
772 fsubd $c1lo,$two64,$c1lo
773 fsubd $c1hi,$two64,$c1hi
774 fsubd $c3lo,$two130,$c3lo
775 fsubd $c3hi,$two130,$c3hi
776 fsubd $c0lo,$two32,$c0lo
777 fsubd $c0hi,$two32,$c0hi
778 fsubd $c2lo,$two96,$c2lo
779 fsubd $c2hi,$two96,$c2hi
780
781 fsubd $h1lo,$c1lo,$h1lo
782 fsubd $h1hi,$c1hi,$h1hi
783 fsubd $h3lo,$c3lo,$h3lo
784 fsubd $h3hi,$c3hi,$h3hi
785 fsubd $h2lo,$c2lo,$h2lo
786 fsubd $h2hi,$c2hi,$h2hi
787 fsubd $h0lo,$c0lo,$h0lo
788 fsubd $h0hi,$c0hi,$h0hi
789
790 faddd $h1lo,$c0lo,$h1lo
791 faddd $h1hi,$c0hi,$h1hi
792 faddd $h3lo,$c2lo,$h3lo
793 faddd $h3hi,$c2hi,$h3hi
794 faddd $h2lo,$c1lo,$h2lo
795 faddd $h2hi,$c1hi,$h2hi
796 fmaddd $five_two130,$c3lo,$h0lo,$h0lo
797 fmaddd $five_two130,$c3hi,$h0hi,$h0hi
798
799 faddd $h1lo,$h1hi,$x1
800 ldd [$ctx+8*12],$s1lo ! reload constants
801 faddd $h3lo,$h3hi,$x3
802 ldd [$ctx+8*13],$s1hi
803 faddd $h2lo,$h2hi,$x2
804 ldd [$ctx+8*10],$r3lo
805 faddd $h0lo,$h0hi,$x0
806 ldd [$ctx+8*11],$r3hi
807
808 .Lentry_fma:
809 fmuld $x1,$s3lo,$h0lo
810 fmuld $x1,$s3hi,$h0hi
811 fmuld $x1,$r1lo,$h2lo
812 fmuld $x1,$r1hi,$h2hi
813 fmuld $x1,$r0lo,$h1lo
814 fmuld $x1,$r0hi,$h1hi
815 fmuld $x1,$r2lo,$h3lo
816 fmuld $x1,$r2hi,$h3hi
817
818 fmaddd $x3,$s1lo,$h0lo,$h0lo
819 fmaddd $x3,$s1hi,$h0hi,$h0hi
820 fmaddd $x3,$s3lo,$h2lo,$h2lo
821 fmaddd $x3,$s3hi,$h2hi,$h2hi
822 fmaddd $x3,$s2lo,$h1lo,$h1lo
823 fmaddd $x3,$s2hi,$h1hi,$h1hi
824 fmaddd $x3,$r0lo,$h3lo,$h3lo
825 fmaddd $x3,$r0hi,$h3hi,$h3hi
826
827 fmaddd $x2,$s2lo,$h0lo,$h0lo
828 fmaddd $x2,$s2hi,$h0hi,$h0hi
829 fmaddd $x2,$r0lo,$h2lo,$h2lo
830 fmaddd $x2,$r0hi,$h2hi,$h2hi
831 fmaddd $x2,$s3lo,$h1lo,$h1lo
832 ldd [%sp+LOCALS+8*0],$y0 ! load [biased] input
833 fmaddd $x2,$s3hi,$h1hi,$h1hi
834 ldd [%sp+LOCALS+8*1],$y1
835 fmaddd $x2,$r1lo,$h3lo,$h3lo
836 ldd [%sp+LOCALS+8*2],$y2
837 fmaddd $x2,$r1hi,$h3hi,$h3hi
838 ldd [%sp+LOCALS+8*3],$y3
839
840 fmaddd $x0,$r0lo,$h0lo,$h0lo
841 fsubd $y0,$two0, $y0 ! de-bias input
842 fmaddd $x0,$r0hi,$h0hi,$h0hi
843 fsubd $y1,$two32,$y1
844 fmaddd $x0,$r2lo,$h2lo,$h2lo
845 fsubd $y2,$two64,$y2
846 fmaddd $x0,$r2hi,$h2hi,$h2hi
847 fsubd $y3,$two96,$y3
848 fmaddd $x0,$r1lo,$h1lo,$h1lo
849 fmaddd $x0,$r1hi,$h1hi,$h1hi
850 fmaddd $x0,$r3lo,$h3lo,$h3lo
851 fmaddd $x0,$r3hi,$h3hi,$h3hi
852
853 bcc SIZE_T_CC,.Loop_fma
854 subcc $len,1,$len
855
856 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! base 2^48 -> base 2^32
857 faddd $h0lo,$two32,$c0lo
858 faddd $h0hi,$two32,$c0hi
859 faddd $h2lo,$two96,$c2lo
860 faddd $h2hi,$two96,$c2hi
861 faddd $h1lo,$two64,$c1lo
862 faddd $h1hi,$two64,$c1hi
863 faddd $h3lo,$two130,$c3lo
864 faddd $h3hi,$two130,$c3hi
865
866 fsubd $c0lo,$two32,$c0lo
867 fsubd $c0hi,$two32,$c0hi
868 fsubd $c2lo,$two96,$c2lo
869 fsubd $c2hi,$two96,$c2hi
870 fsubd $c1lo,$two64,$c1lo
871 fsubd $c1hi,$two64,$c1hi
872 fsubd $c3lo,$two130,$c3lo
873 fsubd $c3hi,$two130,$c3hi
874
875 fsubd $h1lo,$c1lo,$h1lo
876 fsubd $h1hi,$c1hi,$h1hi
877 fsubd $h3lo,$c3lo,$h3lo
878 fsubd $h3hi,$c3hi,$h3hi
879 fsubd $h2lo,$c2lo,$h2lo
880 fsubd $h2hi,$c2hi,$h2hi
881 fsubd $h0lo,$c0lo,$h0lo
882 fsubd $h0hi,$c0hi,$h0hi
883
884 faddd $h1lo,$c0lo,$h1lo
885 faddd $h1hi,$c0hi,$h1hi
886 faddd $h3lo,$c2lo,$h3lo
887 faddd $h3hi,$c2hi,$h3hi
888 faddd $h2lo,$c1lo,$h2lo
889 faddd $h2hi,$c1hi,$h2hi
890 fmaddd $five_two130,$c3lo,$h0lo,$h0lo
891 fmaddd $five_two130,$c3hi,$h0hi,$h0hi
892
893 faddd $h1lo,$h1hi,$x1
894 faddd $h3lo,$h3hi,$x3
895 faddd $h2lo,$h2hi,$x2
896 faddd $h0lo,$h0hi,$x0
897
898 faddd $x1,$two32,$x1 ! bias
899 faddd $x3,$two96,$x3
900 faddd $x2,$two64,$x2
901 faddd $x0,$two0, $x0
902
903 ldx [%sp+LOCALS+8*4],%fsr ! restore saved %fsr
904
905 std $x1,[$ctx+8*1] ! store [biased] hash value
906 std $x3,[$ctx+8*3]
907 std $x2,[$ctx+8*2]
908 std $x0,[$ctx+8*0]
909
910 .Labort:
911 ret
912 restore
913 .type poly1305_blocks_fma,#function
914 .size poly1305_blocks_fma,.-poly1305_blocks_fma
915 ___
# Scope for poly1305_emit_fma, the finalisation routine paired with the FMA
# block function.  It reads the four stored doubles as pairs of 32-bit
# words, masks off the exponent bits of each high word, folds the high
# words into the neighbouring low words with a carry chain, performs the
# "+5" comparison against the modulus using a computed all-ones/all-zeros
# mask, adds the nonce and writes the 16 result bytes little-endian.
#
# Register assignment: the right-hand list yields eleven names for ten
# variables, so $h0-$h4 = %l0-%l4, $d0 = %l5, $d1-$d3 = %o0-%o2 and
# $mask = %o3; the last generated name (%o4) is not used.  $padbit (%i3) is
# reused inside the routine as a scratch register.
916 {
917 my ($mac,$nonce)=($inp,$len);
918
919 my ($h0,$h1,$h2,$h3,$h4, $d0,$d1,$d2,$d3, $mask
920 ) = (map("%l$_",(0..5)),map("%o$_",(0..4)));
921
922 $code.=<<___;
923 .align 32
924 poly1305_emit_fma:
925 save %sp,-STACK_FRAME,%sp
926
927 ld [$ctx+8*0+0],$d0 ! load hash
928 ld [$ctx+8*0+4],$h0
929 ld [$ctx+8*1+0],$d1
930 ld [$ctx+8*1+4],$h1
931 ld [$ctx+8*2+0],$d2
932 ld [$ctx+8*2+4],$h2
933 ld [$ctx+8*3+0],$d3
934 ld [$ctx+8*3+4],$h3
935
936 sethi %hi(0xfff00000),$mask
937 andn $d0,$mask,$d0 ! mask exponent
938 andn $d1,$mask,$d1
939 andn $d2,$mask,$d2
940 andn $d3,$mask,$d3 ! can be partially reduced...
941 mov 3,$mask
942
943 srl $d3,2,$padbit ! ... so reduce
944 and $d3,$mask,$h4
945 andn $d3,$mask,$d3
946 add $padbit,$d3,$d3
947
948 addcc $d3,$h0,$h0
949 addccc $d0,$h1,$h1
950 addccc $d1,$h2,$h2
951 addccc $d2,$h3,$h3
952 addc %g0,$h4,$h4
953
954 addcc $h0,5,$d0 ! compare to modulus
955 addccc $h1,0,$d1
956 addccc $h2,0,$d2
957 addccc $h3,0,$d3
958 addc $h4,0,$mask
959
960 srl $mask,2,$mask ! did it carry/borrow?
961 neg $mask,$mask
962 sra $mask,31,$mask ! mask
963
964 andn $h0,$mask,$h0
965 and $d0,$mask,$d0
966 andn $h1,$mask,$h1
967 and $d1,$mask,$d1
968 or $d0,$h0,$h0
969 ld [$nonce+0],$d0 ! load nonce
970 andn $h2,$mask,$h2
971 and $d2,$mask,$d2
972 or $d1,$h1,$h1
973 ld [$nonce+4],$d1
974 andn $h3,$mask,$h3
975 and $d3,$mask,$d3
976 or $d2,$h2,$h2
977 ld [$nonce+8],$d2
978 or $d3,$h3,$h3
979 ld [$nonce+12],$d3
980
981 addcc $d0,$h0,$h0 ! accumulate nonce
982 addccc $d1,$h1,$h1
983 addccc $d2,$h2,$h2
984 addc $d3,$h3,$h3
985
986 stb $h0,[$mac+0] ! write little-endian result
987 srl $h0,8,$h0
988 stb $h1,[$mac+4]
989 srl $h1,8,$h1
990 stb $h2,[$mac+8]
991 srl $h2,8,$h2
992 stb $h3,[$mac+12]
993 srl $h3,8,$h3
994
995 stb $h0,[$mac+1]
996 srl $h0,8,$h0
997 stb $h1,[$mac+5]
998 srl $h1,8,$h1
999 stb $h2,[$mac+9]
1000 srl $h2,8,$h2
1001 stb $h3,[$mac+13]
1002 srl $h3,8,$h3
1003
1004 stb $h0,[$mac+2]
1005 srl $h0,8,$h0
1006 stb $h1,[$mac+6]
1007 srl $h1,8,$h1
1008 stb $h2,[$mac+10]
1009 srl $h2,8,$h2
1010 stb $h3,[$mac+14]
1011 srl $h3,8,$h3
1012
1013 stb $h0,[$mac+3]
1014 stb $h1,[$mac+7]
1015 stb $h2,[$mac+11]
1016 stb $h3,[$mac+15]
1017
1018 ret
1019 restore
1020 .type poly1305_emit_fma,#function
1021 .size poly1305_emit_fma,.-poly1305_emit_fma
1022 ___
1023 }
1024
# Constant pool for the FMA code path, addressed PC-relatively as
# .Lconsts_fma+8*k by poly1305_init_fma and poly1305_blocks_fma:
#   k = 0..3   bias values 2^(52+0), 2^(52+32), 2^(52+64), 2^(52+96)
#   k = 4      2^(52+130)
#   k = 5      5/2^130
#   k = 6      the %fsr value installed while doing floating-point work
#   k = 7..13  the additional constants that poly1305_init_fma loads while
#              preparing the key
# The entry order is part of the contract with those routines.  The string
# after the table is an identification banner; "\@" is needed because the
# heredoc interpolates.
1025 $code.=<<___;
1026 .align 64
1027 .Lconsts_fma:
1028 .word 0x43300000,0x00000000 ! 2^(52+0)
1029 .word 0x45300000,0x00000000 ! 2^(52+32)
1030 .word 0x47300000,0x00000000 ! 2^(52+64)
1031 .word 0x49300000,0x00000000 ! 2^(52+96)
1032 .word 0x4b500000,0x00000000 ! 2^(52+130)
1033
1034 .word 0x37f40000,0x00000000 ! 5/2^130
1035 .word 0,1<<30 ! fsr: truncate, no exceptions
1036
1037 .word 0x44300000,0x00000000 ! 2^(52+16+0)
1038 .word 0x46300000,0x00000000 ! 2^(52+16+32)
1039 .word 0x48300000,0x00000000 ! 2^(52+16+64)
1040 .word 0x4a300000,0x00000000 ! 2^(52+16+96)
1041 .word 0x3e300000,0x00000000 ! 2^(52+16+0-96)
1042 .word 0x40300000,0x00000000 ! 2^(52+16+32-96)
1043 .word 0x42300000,0x00000000 ! 2^(52+16+64-96)
1044 .asciz "Poly1305 for SPARCv9/VIS3/FMA, CRYPTOGAMS by <appro\@openssl.org>"
1045 .align 4
1046 ___
1047 }
1048 \f
1049 # The purpose of these subroutines is to encode VIS3 and FMA instructions
1050 # explicitly as .word directives, so that the module can be assembled
1051 # without specifying the extensions on the compiler command line, e.g.
1052 # -xarch=v9 vs. -xarch=v9a. The idea is to keep the option of producing a
1053 # "universal" binary and to detect at run-time whether the CPU is capable.
# Encode a three-operand VIS3 integer instruction as a raw ".word" so the
# output assembles without VIS3 support in the assembler.
#
#   $mnemonic      instruction name; only addxc, addxccc and umulxhi are
#                  encoded
#   $rs1,$rs2,$rd  operands, each of the form %g0-%g7, %o0-%o7, %l0-%l7 or
#                  %i0-%i7
#
# Returns ".word\t0x........ !<original text>" when the instruction can be
# encoded.  For an unknown mnemonic, or an operand that is not exactly a
# valid integer register name, the original "mnemonic\trs1,rs2,rd" text is
# returned unchanged so that the assembler sees (and diagnoses) it.
sub unvis3 {
    my ($mnemonic,$rs1,$rs2,$rd)=@_;
    my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
    my %visopf = (	"addxc"		=> 0x011,
			"addxccc"	=> 0x013,
			"umulxhi"	=> 0x016	);

    my $ref = "$mnemonic\t$rs1,$rs2,$rd";

    my $opf = $visopf{$mnemonic};
    return $ref unless defined $opf;

    # Translate each register name to its 5-bit number.  The pattern is
    # anchored and limited to 0-7 so that a bogus name such as %o9 is passed
    # through instead of being silently encoded as a different register.
    my @num;
    foreach my $reg ($rs1,$rs2,$rd) {
	return $ref unless $reg =~ /\A%([goli])([0-7])\z/;
	push @num, $bias{$1}+$2;
    }
    my ($n1,$n2,$nd) = @num;

    return sprintf ".word\t0x%08x !%s",
		    0x81b00000|$nd<<25|$n1<<14|$opf<<5|$n2,
		    $ref;
}
1077
# Encode a four-operand fused multiply-add instruction as a raw ".word" so
# the output assembles without FMA support in the assembler.
#
#   $mnemonic           fmadds, fmaddd, fmsubs or fmsubd
#   $rs1,$rs2,$rs3,$rd  operands of the form %fN
#
# Returns ".word\t0x........ !<original text>" when the instruction can be
# encoded.  For an unknown mnemonic or an operand that cannot be encoded
# (not exactly %fN, an odd register at or above %f32, or a register above
# %f62) the original "mnemonic\trs1,rs2,rs3,rd" text is returned unchanged
# so that the assembler sees (and diagnoses) it.
sub unfma {
    my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
    my %fmaopf = (	"fmadds"	=> 0x1,
			"fmaddd"	=> 0x2,
			"fmsubs"	=> 0x5,
			"fmsubd"	=> 0x6		);

    my $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";

    my $opf = $fmaopf{$mnemonic};
    return $ref unless defined $opf;

    my @num;
    foreach my $reg ($rs1,$rs2,$rs3,$rd) {
	# Anchored so that e.g. %f100 is not mistaken for %f10.
	return $ref unless $reg =~ /\A%f([0-9]{1,2})\z/;
	my $n = $1;
	if ($n>=32) {
	    # The upper half of the register file holds only even-numbered
	    # double registers up to %f62; anything else cannot be encoded.
	    return $ref if ($n&1) || $n>62;
	    # re-encode for upper double register addressing
	    $n = ($n|$n>>5)&31;
	}
	push @num, $n;
    }
    my ($n1,$n2,$n3,$nd) = @num;

    return sprintf ".word\t0x%08x !%s",
		    0x81b80000|$nd<<25|$n1<<14|$n3<<9|$opf<<5|$n2,
		    $ref;
}
1106
# Post-process the accumulated assembly one line at a time and print it.
foreach my $line (split("\n",$code)) {
	# Evaluate `...` expressions (e.g. `8*4+4`) embedded in the text.
	$line =~ s/\`([^\`]*)\`/eval $1/ge;

	# Replace VIS3 and FMA instructions with explicit .word encodings.
	# A line holds at most one of the two kinds, hence the "or".
	$line =~ s/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
		unvis3($1,$2,$3,$4)
	 /ge or
	$line =~ s/\b(fmadd[sd])\s+(%f[0-9]+),\s*(%f[0-9]+),\s*(%f[0-9]+),\s*(%f[0-9]+)/
		unfma($1,$2,$3,$4,$5)
	 /ge;

	print $line,"\n";
}

# Output is buffered, so a write failure (e.g. a full disk) may only be
# reported here; fail loudly rather than leave a truncated file behind.
close STDOUT or die "error closing STDOUT: $!";