2 # Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
17 # This module implements Poly1305 hash for SPARCv9, vanilla, as well
18 # as VIS3 and FMA extensions.
22 # Numbers are cycles per processed byte with poly1305_blocks alone.
26 # UltraSPARC III 12.3(**)
28 # SPARC T4 1.70(***) 6.55
31 # (*) Comparison to compiler-generated code is really problematic,
32 # because the latter's performance varies too much depending on too
33 # many variables. For example, one can measure from 5x to 15x
34 # improvement on T4 for gcc-4.6. Well, in the T4 case it's a bit of
35 # an unfair comparison, because the compiler doesn't use VIS3, but
36 # given the same initial conditions the coefficient varies from 3x to 9x.
37 # (**) Pre-III performance should be even worse; floating-point
38 # performance for UltraSPARC I-IV on the other hand is reported
39 # to be 4.25 for hand-coded assembly, but they are just too old
40 # to care about.
41 # (***) Multi-process benchmark saturates at ~12.5x single-process
42 # result on 8-core processor, or ~21GBps per 2.85GHz socket.
# Redirect STDOUT to the file named by $output (two-argument open; the ">"
# prefix opens the file for writing). The result of open is not checked.
45 open STDOUT
,">$output";
# Symbolic names for SPARC integer registers. The assembly text that follows
# refers to registers only through these Perl variables.
# $ctx,$inp,$len,$padbit,$shl,$shr = %i0..%i5
47 my ($ctx,$inp,$len,$padbit,$shl,$shr) = map("%i$_",(0..5));
# $r0,$r1,$r2,$r3,$s1,$s2,$s3,$h4 = %l0..%l7
48 my ($r0,$r1,$r2,$r3,$s1,$s2,$s3,$h4) = map("%l$_",(0..7));
# $h0..$h3 = %o0..%o3, $t0,$t1 = %o4,%o5, $t2 = %o7; %o6 is deliberately
# absent from the list (on SPARC %o6 is the stack pointer).
49 my ($h0,$h1,$h2,$h3, $t0,$t1,$t2) = map("%o$_",(0..5,7));
# $d0..$d3 = %g1..%g4
50 my ($d0,$d1,$d2,$d3) = map("%g$_",(1..4));
# Second redirect of STDOUT.
# NOTE(review): $stdout is not assigned anywhere in the visible source, while
# the redirect above uses $output; confirm whether $output was intended here.
# The result of open is not checked, so a failure would go unnoticed.
53 open STDOUT
,">$stdout";
56 #include "sparc_arch.h"
59 .register
%g2,#scratch
60 .register
%g3,#scratch
67 #define LOCALS (STACK_BIAS+STACK_FRAME)
69 .section
".text",#alloc,#execinstr
78 save
%sp,-STACK_FRAME
-16,%sp
81 SPARC_LOAD_ADDRESS
(OPENSSL_sparcv9cap_P
,%g1)
84 and %g1,SPARCV9_FMADD
|SPARCV9_VIS3
,%g1
86 be
.Lpoly1305_init_fma
90 stx
%g0,[$ctx+8] ! zero hash value
94 and $inp,7,$shr ! alignment factor
99 sethi
%hi(0x0ffffffc),$t0
101 or $t0,%lo(0x0ffffffc),$t0
104 or $t0,$t1,$t1 ! 0x0ffffffc0ffffffc
105 or $t1,3,$t0 ! 0x0ffffffc0fffffff
107 ldxa
[$inp+%g0]0x88,$h0 ! load little
-endian key
108 brz
,pt
$shr,.Lkey_aligned
109 ldxa
[$inp+$h1]0x88,$h1
111 ldxa
[$inp+$h2]0x88,$h2
122 stx
$h0,[$ctx+32+0] ! store key
125 andcc
%g1,SPARCV9_VIS3
,%g0
130 add
%o7,poly1305_blocks_vis3
-1b
,%o7
132 add
%o7,poly1305_emit
-poly1305_blocks_vis3
,%o5
134 STPTR
%o5,[%i2+SIZE_T
]
137 restore
%g0,1,%o0 ! return 1
141 restore
%g0,%g0,%o0 ! return 0
142 .type poly1305_init
,#function
143 .size poly1305_init
,.-poly1305_init
145 .globl poly1305_blocks
148 save
%sp,-STACK_FRAME
,%sp
151 brz
,pn
$len,.Lno_data
154 ld
[$ctx+32+0],$r1 ! load key
159 ld
[$ctx+0],$h1 ! load hash value
165 and $inp,7,$shr ! alignment factor
180 ldxa
[$inp+%g0]0x88,$d0 ! load little
-endian input
181 brz
,pt
$shr,.Linp_aligned
182 ldxa
[$inp+$d1]0x88,$d1
184 ldxa
[$inp+$d2]0x88,$d2
194 addcc
$d0,$h0,$h0 ! accumulate input
254 srl
$h4,2,$t0 ! final reduction step
266 st
$h1,[$ctx+0] ! store hash value
275 .type poly1305_blocks
,#function
276 .size poly1305_blocks
,.-poly1305_blocks
278 ########################################################################
279 # VIS3 has umulxhi and addxc...
# Register names for the VIS3 code path.
# $H0,$H1,$H2,$R0,$R1,$S1 = %o0..%o5 and $T1 = %o7 (%o6 is again left out).
# These are the same physical registers that the integer code path names
# $h0..$h3,$t0,$t1,$t2.
281 my ($H0,$H1,$H2,$R0,$R1,$S1,$T1) = map("%o$_",(0..5,7));
# $D0,$D1,$D2,$T0 = %g1..%g4, the same registers the integer code path names
# $d0..$d3.
282 my ($D0,$D1,$D2,$T0) = map("%g$_",(1..4));
286 poly1305_blocks_vis3
:
287 save
%sp,-STACK_FRAME
,%sp
290 brz
,pn
$len,.Lno_data
293 ldx
[$ctx+32+0],$R0 ! load key
296 ldx
[$ctx+0],$H0 ! load hash value
300 and $inp,7,$shr ! alignment factor
312 ldxa
[$inp+%g0]0x88,$D0 ! load little
-endian input
313 brz
,pt
$shr,.Linp_aligned_vis3
314 ldxa
[$inp+$r1]0x88,$D1
316 ldxa
[$inp+$r2]0x88,$D2
325 addcc
$D0,$H0,$H0 ! accumulate input
330 mulx
$R0,$H0,$D0 ! r0
*h0
331 addxc
$padbit,$H2,$H2
333 mulx
$S1,$H1,$T0 ! s1
*h1
336 mulx
$R1,$H0,$T0 ! r1
*h0
340 mulx
$R0,$H1,$T0 ! r0
*h1
344 mulx
$S1,$H2,$T0 ! s1
*h2
346 mulx
$R0,$H2,$T1 ! r0
*h2
350 srlx
$D2,2,$T0 ! final reduction step
357 brnz
,pt
$len,.Loop_vis3
360 stx
$H0,[$ctx+0] ! store hash value
366 .type poly1305_blocks_vis3
,#function
367 .size poly1305_blocks_vis3
,.-poly1305_blocks_vis3
# poly1305_emit argument names: $mac and $nonce are new names for the
# registers held in $inp and $len. The emit code stores the result bytes
# through $mac and loads the nonce through $nonce.
370 my ($mac,$nonce) = ($inp,$len);
376 save
%sp,-STACK_FRAME
,%sp
378 ld
[$ctx+0],$h1 ! load hash value
384 addcc
$h0,5,$r0 ! compare to modulus
389 andcc
$h4,4,%g0 ! did it carry
/borrow?
392 ld
[$nonce+0],$r0 ! load nonce
400 addcc
$r0,$h0,$h0 ! accumulate nonce
406 stb
$h0,[$mac+0] ! store little
-endian result
439 .type poly1305_emit
,#function
440 .size poly1305_emit
,.-poly1305_emit
# Register names for the FMA code path. Several names used by the integer
# code path are redeclared here with different registers (for example $shr
# and $shl move from %i5/%i4 to %l2/%l3).
# $ctx,$inp,$len,$padbit = %i0..%i3
444 my ($ctx,$inp,$len,$padbit) = map("%i$_",(0..3));
# $in0..$in4 = %o0..%o4
445 my ($in0,$in1,$in2,$in3,$in4) = map("%o$_",(0..4));
# map() generates eight names but only four variables receive them:
# $i1,$step,$shr,$shl = %l0..%l3; %l4..%l7 are dropped by the assignment.
446 my ($i1,$step,$shr,$shl) = map("%l$_",(0..7));
# Floating-point registers. "%f".2*$_ parses as "%f".(2*$_), so the 32 names
# below map, in order, to the even-numbered registers %f0,%f2,...,%f62:
#   $h0lo .. $h3hi          %f0  .. %f14
#   $two0 .. $five_two130   %f16 .. %f26
#   $r0lo .. $r2hi          %f28 .. %f38
#   $s2lo .. $s3hi          %f40 .. %f46
#   $c0lo .. $c3hi          %f48 .. %f62
449 my ($h0lo,$h0hi,$h1lo,$h1hi,$h2lo,$h2hi,$h3lo,$h3hi,
450 $two0,$two32,$two64,$two96,$two130,$five_two130,
451 $r0lo,$r0hi,$r1lo,$r1hi,$r2lo,$r2hi,
452 $s2lo,$s2hi,$s3lo,$s3hi,
453 $c0lo,$c0hi,$c1lo,$c1hi,$c2lo,$c2hi,$c3lo,$c3hi) = map("%f".2*$_,(0..31));
# $r3lo,$r3hi,$s1lo,$s1hi have no registers of their own: they share
# %f48..%f54 with $c0lo,$c0hi,$c1lo,$c1hi.
455 my ($r3lo,$r3hi,$s1lo,$s1hi) = ($c0lo,$c0hi,$c1lo,$c1hi);
# $x0..$x3 share %f56..%f62 with $c2lo,$c2hi,$c3lo,$c3hi.
456 my ($x0,$x1,$x2,$x3) = ($c2lo,$c2hi,$c3lo,$c3hi);
# $y0,$y1 share $c1lo,$c1hi (%f52,%f54). Note the order of the last pair:
# $y2 is $c3hi (%f62) and $y3 is $c3lo (%f60), the reverse of $x2/$x3.
457 my ($y0,$y1,$y2,$y3) = ($c1lo,$c1hi,$c3hi,$c3lo);
462 save
%sp,-STACK_FRAME
-16,%sp
467 add
%o7,.Lconsts_fma
-1b
,%o7
469 ldd
[%o7+8*0],$two0 ! load constants
473 ldd
[%o7+8*5],$five_two130
475 std
$two0,[$ctx+8*0] ! initial hash value
, biased
0
476 std
$two32,[$ctx+8*1]
477 std
$two64,[$ctx+8*2]
478 std
$two96,[$ctx+8*3]
480 brz
,pn
$inp,.Lno_key_fma
483 stx
%fsr,[%sp+LOCALS
] ! save original
%fsr
484 ldx
[%o7+8*6],%fsr ! load new
%fsr
486 std
$two0,[$ctx+8*4] ! key
"template"
487 std
$two32,[$ctx+8*5]
488 std
$two64,[$ctx+8*6]
489 std
$two96,[$ctx+8*7]
492 andn
$inp,7,$inp ! align pointer
498 ldxa
[$inp+%g0]0x88,$in0 ! load little
-endian key
499 ldxa
[$inp+$i1]0x88,$in2
501 brz
$shr,.Lkey_aligned_fma
502 sethi
%hi(0xf0000000),$i1 ! 0xf0000000
504 ldxa
[$inp+$i2]0x88,$in4
506 srlx
$in0,$shr,$in0 ! align data
514 or $i1,3,$i2 ! 0xf0000003
516 andn
$in0,$i1,$in0 ! &=0x0fffffff
517 andn
$in1,$i2,$in1 ! &=0x0ffffffc
522 st
$in0,[$ctx+`8*4+4`] ! fill
"template"
523 st
$in1,[$ctx+`8*5+4`]
524 st
$in2,[$ctx+`8*6+4`]
525 st
$in3,[$ctx+`8*7+4`]
527 ldd
[$ctx+8*4],$h0lo ! load
[biased
] key
532 fsubd
$h0lo,$two0, $h0lo ! r0
533 ldd
[%o7+8*7],$two0 ! more constants
534 fsubd
$h1lo,$two32,$h1lo ! r1
536 fsubd
$h2lo,$two64,$h2lo ! r2
538 fsubd
$h3lo,$two96,$h3lo ! r3
539 ldd
[%o7+8*10],$two96
541 fmuld
$five_two130,$h1lo,$s1lo ! s1
542 fmuld
$five_two130,$h2lo,$s2lo ! s2
543 fmuld
$five_two130,$h3lo,$s3lo ! s3
545 faddd
$h0lo,$two0, $h0hi
546 faddd
$h1lo,$two32,$h1hi
547 faddd
$h2lo,$two64,$h2hi
548 faddd
$h3lo,$two96,$h3hi
550 fsubd
$h0hi,$two0, $h0hi
551 ldd
[%o7+8*11],$two0 ! more constants
552 fsubd
$h1hi,$two32,$h1hi
553 ldd
[%o7+8*12],$two32
554 fsubd
$h2hi,$two64,$h2hi
555 ldd
[%o7+8*13],$two64
556 fsubd
$h3hi,$two96,$h3hi
558 fsubd
$h0lo,$h0hi,$h0lo
559 std
$h0hi,[$ctx+8*5] ! r0hi
560 fsubd
$h1lo,$h1hi,$h1lo
561 std
$h1hi,[$ctx+8*7] ! r1hi
562 fsubd
$h2lo,$h2hi,$h2lo
563 std
$h2hi,[$ctx+8*9] ! r2hi
564 fsubd
$h3lo,$h3hi,$h3lo
565 std
$h3hi,[$ctx+8*11] ! r3hi
567 faddd
$s1lo,$two0, $s1hi
568 faddd
$s2lo,$two32,$s2hi
569 faddd
$s3lo,$two64,$s3hi
571 fsubd
$s1hi,$two0, $s1hi
572 fsubd
$s2hi,$two32,$s2hi
573 fsubd
$s3hi,$two64,$s3hi
575 fsubd
$s1lo,$s1hi,$s1lo
576 fsubd
$s2lo,$s2hi,$s2lo
577 fsubd
$s3lo,$s3hi,$s3lo
579 ldx
[%sp+LOCALS
],%fsr ! restore
%fsr
581 std
$h0lo,[$ctx+8*4] ! r0lo
582 std
$h1lo,[$ctx+8*6] ! r1lo
583 std
$h2lo,[$ctx+8*8] ! r2lo
584 std
$h3lo,[$ctx+8*10] ! r3lo
586 std
$s1hi,[$ctx+8*13]
587 std
$s2hi,[$ctx+8*15]
588 std
$s3hi,[$ctx+8*17]
590 std
$s1lo,[$ctx+8*12]
591 std
$s2lo,[$ctx+8*14]
592 std
$s3lo,[$ctx+8*16]
594 add
%o7,poly1305_blocks_fma
-.Lconsts_fma
,%o0
595 add
%o7,poly1305_emit_fma
-.Lconsts_fma
,%o1
597 STPTR
%o1,[%i2+SIZE_T
]
600 restore
%g0,1,%o0 ! return 1
604 restore
%g0,%g0,%o0 ! return 0
605 .type poly1305_init_fma
,#function
606 .size poly1305_init_fma
,.-poly1305_init_fma
610 save
%sp,-STACK_FRAME
-48,%sp
617 add
%o7,.Lconsts_fma
-1b
,%o7
619 ldd
[%o7+8*0],$two0 ! load constants
623 ldd
[%o7+8*4],$two130
624 ldd
[%o7+8*5],$five_two130
626 ldd
[$ctx+8*0],$h0lo ! load
[biased
] hash value
631 std
$two0,[%sp+LOCALS
+8*0] ! input
"template"
632 sethi
%hi((1023+52+96)<<20),$in3
633 std
$two32,[%sp+LOCALS
+8*1]
635 std
$two64,[%sp+LOCALS
+8*2]
636 st
$in3,[%sp+LOCALS
+8*3]
639 andn
$inp,7,$inp ! align pointer
645 ldxa
[$inp+%g0]0x88,$in0 ! load little
-endian input
646 brz
$shr,.Linp_aligned_fma
647 ldxa
[$inp+$i1]0x88,$in2
649 ldxa
[$inp+$step]0x88,$in4
652 srlx
$in0,$shr,$in0 ! align data
657 srlx
$in4,$shr,$in4 ! pre
-shift
664 add
$step,$inp,$inp ! conditional advance
666 st
$in0,[%sp+LOCALS
+8*0+4] ! fill
"template"
667 st
$in1,[%sp+LOCALS
+8*1+4]
668 st
$in2,[%sp+LOCALS
+8*2+4]
669 st
$in3,[%sp+LOCALS
+8*3+4]
671 ldd
[$ctx+8*4],$r0lo ! load key
677 ldd
[$ctx+8*10],$r3lo
678 ldd
[$ctx+8*11],$r3hi
679 ldd
[$ctx+8*12],$s1lo
680 ldd
[$ctx+8*13],$s1hi
681 ldd
[$ctx+8*14],$s2lo
682 ldd
[$ctx+8*15],$s2hi
683 ldd
[$ctx+8*16],$s3lo
684 ldd
[$ctx+8*17],$s3hi
686 stx
%fsr,[%sp+LOCALS
+8*4] ! save original
%fsr
687 ldx
[%o7+8*6],%fsr ! load new
%fsr
692 ldd
[%sp+LOCALS
+8*0],$x0 ! load biased input
693 ldd
[%sp+LOCALS
+8*1],$x1
694 ldd
[%sp+LOCALS
+8*2],$x2
695 ldd
[%sp+LOCALS
+8*3],$x3
697 fsubd
$h0lo,$two0, $h0lo ! de
-bias hash value
698 fsubd
$h1lo,$two32,$h1lo
699 ldxa
[$inp+%g0]0x88,$in0 ! modulo
-scheduled input load
700 fsubd
$h2lo,$two64,$h2lo
701 fsubd
$h3lo,$two96,$h3lo
702 ldxa
[$inp+$i1]0x88,$in2
704 fsubd
$x0,$two0, $x0 ! de
-bias input
709 brz
$shr,.Linp_aligned_fma2
710 add
$step,$inp,$inp ! conditional advance
712 sllx
$in0,$shl,$in1 ! align data
716 srlx
$in2,$shr,$in4 ! pre
-shift
722 faddd
$h0lo,$x0,$x0 ! accumulate input
723 stw
$in0,[%sp+LOCALS
+8*0+4]
725 stw
$in1,[%sp+LOCALS
+8*1+4]
727 stw
$in2,[%sp+LOCALS
+8*2+4]
729 stw
$in3,[%sp+LOCALS
+8*3+4]
736 ldxa
[$inp+%g0]0x88,$in0 ! modulo
-scheduled input load
737 ldxa
[$inp+$i1]0x88,$in2
740 faddd
$y0,$h0lo,$h0lo ! accumulate input
741 faddd
$y1,$h0hi,$h0hi
742 faddd
$y2,$h2lo,$h2lo
743 faddd
$y3,$h2hi,$h2hi
745 brz
,pn
$shr,.Linp_aligned_fma3
746 add
$step,$inp,$inp ! conditional advance
748 sllx
$in0,$shl,$in1 ! align data
752 srlx
$in2,$shr,$in4 ! pre
-shift
756 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! base
2^48 -> base
2^32
757 faddd
$two64,$h1lo,$c1lo
759 faddd
$two64,$h1hi,$c1hi
761 faddd
$two130,$h3lo,$c3lo
762 st
$in0,[%sp+LOCALS
+8*0+4] ! fill
"template"
763 faddd
$two130,$h3hi,$c3hi
764 st
$in1,[%sp+LOCALS
+8*1+4]
765 faddd
$two32,$h0lo,$c0lo
766 st
$in2,[%sp+LOCALS
+8*2+4]
767 faddd
$two32,$h0hi,$c0hi
768 st
$in3,[%sp+LOCALS
+8*3+4]
769 faddd
$two96,$h2lo,$c2lo
770 faddd
$two96,$h2hi,$c2hi
772 fsubd
$c1lo,$two64,$c1lo
773 fsubd
$c1hi,$two64,$c1hi
774 fsubd
$c3lo,$two130,$c3lo
775 fsubd
$c3hi,$two130,$c3hi
776 fsubd
$c0lo,$two32,$c0lo
777 fsubd
$c0hi,$two32,$c0hi
778 fsubd
$c2lo,$two96,$c2lo
779 fsubd
$c2hi,$two96,$c2hi
781 fsubd
$h1lo,$c1lo,$h1lo
782 fsubd
$h1hi,$c1hi,$h1hi
783 fsubd
$h3lo,$c3lo,$h3lo
784 fsubd
$h3hi,$c3hi,$h3hi
785 fsubd
$h2lo,$c2lo,$h2lo
786 fsubd
$h2hi,$c2hi,$h2hi
787 fsubd
$h0lo,$c0lo,$h0lo
788 fsubd
$h0hi,$c0hi,$h0hi
790 faddd
$h1lo,$c0lo,$h1lo
791 faddd
$h1hi,$c0hi,$h1hi
792 faddd
$h3lo,$c2lo,$h3lo
793 faddd
$h3hi,$c2hi,$h3hi
794 faddd
$h2lo,$c1lo,$h2lo
795 faddd
$h2hi,$c1hi,$h2hi
796 fmaddd
$five_two130,$c3lo,$h0lo,$h0lo
797 fmaddd
$five_two130,$c3hi,$h0hi,$h0hi
799 faddd
$h1lo,$h1hi,$x1
800 ldd
[$ctx+8*12],$s1lo ! reload constants
801 faddd
$h3lo,$h3hi,$x3
802 ldd
[$ctx+8*13],$s1hi
803 faddd
$h2lo,$h2hi,$x2
804 ldd
[$ctx+8*10],$r3lo
805 faddd
$h0lo,$h0hi,$x0
806 ldd
[$ctx+8*11],$r3hi
809 fmuld
$x1,$s3lo,$h0lo
810 fmuld
$x1,$s3hi,$h0hi
811 fmuld
$x1,$r1lo,$h2lo
812 fmuld
$x1,$r1hi,$h2hi
813 fmuld
$x1,$r0lo,$h1lo
814 fmuld
$x1,$r0hi,$h1hi
815 fmuld
$x1,$r2lo,$h3lo
816 fmuld
$x1,$r2hi,$h3hi
818 fmaddd
$x3,$s1lo,$h0lo,$h0lo
819 fmaddd
$x3,$s1hi,$h0hi,$h0hi
820 fmaddd
$x3,$s3lo,$h2lo,$h2lo
821 fmaddd
$x3,$s3hi,$h2hi,$h2hi
822 fmaddd
$x3,$s2lo,$h1lo,$h1lo
823 fmaddd
$x3,$s2hi,$h1hi,$h1hi
824 fmaddd
$x3,$r0lo,$h3lo,$h3lo
825 fmaddd
$x3,$r0hi,$h3hi,$h3hi
827 fmaddd
$x2,$s2lo,$h0lo,$h0lo
828 fmaddd
$x2,$s2hi,$h0hi,$h0hi
829 fmaddd
$x2,$r0lo,$h2lo,$h2lo
830 fmaddd
$x2,$r0hi,$h2hi,$h2hi
831 fmaddd
$x2,$s3lo,$h1lo,$h1lo
832 ldd
[%sp+LOCALS
+8*0],$y0 ! load
[biased
] input
833 fmaddd
$x2,$s3hi,$h1hi,$h1hi
834 ldd
[%sp+LOCALS
+8*1],$y1
835 fmaddd
$x2,$r1lo,$h3lo,$h3lo
836 ldd
[%sp+LOCALS
+8*2],$y2
837 fmaddd
$x2,$r1hi,$h3hi,$h3hi
838 ldd
[%sp+LOCALS
+8*3],$y3
840 fmaddd
$x0,$r0lo,$h0lo,$h0lo
841 fsubd
$y0,$two0, $y0 ! de
-bias input
842 fmaddd
$x0,$r0hi,$h0hi,$h0hi
844 fmaddd
$x0,$r2lo,$h2lo,$h2lo
846 fmaddd
$x0,$r2hi,$h2hi,$h2hi
848 fmaddd
$x0,$r1lo,$h1lo,$h1lo
849 fmaddd
$x0,$r1hi,$h1hi,$h1hi
850 fmaddd
$x0,$r3lo,$h3lo,$h3lo
851 fmaddd
$x0,$r3hi,$h3hi,$h3hi
853 bcc SIZE_T_CC
,.Loop_fma
856 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! base
2^48 -> base
2^32
857 faddd
$h0lo,$two32,$c0lo
858 faddd
$h0hi,$two32,$c0hi
859 faddd
$h2lo,$two96,$c2lo
860 faddd
$h2hi,$two96,$c2hi
861 faddd
$h1lo,$two64,$c1lo
862 faddd
$h1hi,$two64,$c1hi
863 faddd
$h3lo,$two130,$c3lo
864 faddd
$h3hi,$two130,$c3hi
866 fsubd
$c0lo,$two32,$c0lo
867 fsubd
$c0hi,$two32,$c0hi
868 fsubd
$c2lo,$two96,$c2lo
869 fsubd
$c2hi,$two96,$c2hi
870 fsubd
$c1lo,$two64,$c1lo
871 fsubd
$c1hi,$two64,$c1hi
872 fsubd
$c3lo,$two130,$c3lo
873 fsubd
$c3hi,$two130,$c3hi
875 fsubd
$h1lo,$c1lo,$h1lo
876 fsubd
$h1hi,$c1hi,$h1hi
877 fsubd
$h3lo,$c3lo,$h3lo
878 fsubd
$h3hi,$c3hi,$h3hi
879 fsubd
$h2lo,$c2lo,$h2lo
880 fsubd
$h2hi,$c2hi,$h2hi
881 fsubd
$h0lo,$c0lo,$h0lo
882 fsubd
$h0hi,$c0hi,$h0hi
884 faddd
$h1lo,$c0lo,$h1lo
885 faddd
$h1hi,$c0hi,$h1hi
886 faddd
$h3lo,$c2lo,$h3lo
887 faddd
$h3hi,$c2hi,$h3hi
888 faddd
$h2lo,$c1lo,$h2lo
889 faddd
$h2hi,$c1hi,$h2hi
890 fmaddd
$five_two130,$c3lo,$h0lo,$h0lo
891 fmaddd
$five_two130,$c3hi,$h0hi,$h0hi
893 faddd
$h1lo,$h1hi,$x1
894 faddd
$h3lo,$h3hi,$x3
895 faddd
$h2lo,$h2hi,$x2
896 faddd
$h0lo,$h0hi,$x0
898 faddd
$x1,$two32,$x1 ! bias
903 ldx
[%sp+LOCALS
+8*4],%fsr ! restore saved
%fsr
905 std
$x1,[$ctx+8*1] ! store
[biased
] hash value
913 .type poly1305_blocks_fma
,#function
914 .size poly1305_blocks_fma
,.-poly1305_blocks_fma
# poly1305_emit_fma argument names: $mac and $nonce are new names for the
# registers held in $inp and $len.
917 my ($mac,$nonce)=($inp,$len);
# Integer working registers for the emit code. The right-hand side yields
# eleven names (%l0..%l5 followed by %o0..%o4) for ten variables, so:
#   $h0..$h4 = %l0..%l4, $d0 = %l5, $d1..$d3 = %o0..%o2, $mask = %o3
# and the last generated name, %o4, is dropped by the assignment.
919 my ($h0,$h1,$h2,$h3,$h4, $d0,$d1,$d2,$d3, $mask
920 ) = (map("%l$_",(0..5)),map("%o$_",(0..4)));
925 save
%sp,-STACK_FRAME
,%sp
927 ld
[$ctx+8*0+0],$d0 ! load hash
936 sethi
%hi(0xfff00000),$mask
937 andn
$d0,$mask,$d0 ! mask exponent
940 andn
$d3,$mask,$d3 ! can be partially reduced
...
943 srl
$d3,2,$padbit ! ... so reduce
954 addcc
$h0,5,$d0 ! compare to modulus
960 srl
$mask,2,$mask ! did it carry
/borrow?
962 sra
$mask,31,$mask ! mask
969 ld
[$nonce+0],$d0 ! load nonce
981 addcc
$d0,$h0,$h0 ! accumulate nonce
986 stb
$h0,[$mac+0] ! write little
-endian result
1020 .type poly1305_emit_fma
,#function
1021 .size poly1305_emit_fma
,.-poly1305_emit_fma
1028 .word
0x43300000,0x00000000 ! 2^(52+0)
1029 .word
0x45300000,0x00000000 ! 2^(52+32)
1030 .word
0x47300000,0x00000000 ! 2^(52+64)
1031 .word
0x49300000,0x00000000 ! 2^(52+96)
1032 .word
0x4b500000,0x00000000 ! 2^(52+130)
1034 .word
0x37f40000,0x00000000 ! 5/2^130
1035 .word
0,1<<30 ! fsr
: truncate, no exceptions
1037 .word
0x44300000,0x00000000 ! 2^(52+16+0)
1038 .word
0x46300000,0x00000000 ! 2^(52+16+32)
1039 .word
0x48300000,0x00000000 ! 2^(52+16+64)
1040 .word
0x4a300000,0x00000000 ! 2^(52+16+96)
1041 .word
0x3e300000,0x00000000 ! 2^(52+16+0-96)
1042 .word
0x40300000,0x00000000 ! 2^(52+16+32-96)
1043 .word
0x42300000,0x00000000 ! 2^(52+16+64-96)
1044 .asciz
"Poly1305 for SPARCv9/VIS3/FMA, CRYPTOGAMS by <appro\@openssl.org>"
1049 # The purpose of these subroutines is to explicitly encode VIS instructions,
1050 # so that one can compile the module without having to specify VIS
1051 # extensions on the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
1052 # The idea is to retain the option of producing a "universal" binary and let
1053 # the programmer detect at run-time whether the current CPU is VIS capable.
# Body of the VIS3 encoder (the enclosing sub header and closing lines are
# not visible in this excerpt).
# Arguments: instruction mnemonic, two source registers, destination register,
# all as text (e.g. "%o3").
1055 my ($mnemonic,$rs1,$rs2,$rd)=@_;
# Offset associated with each integer register bank letter: g, o, l, i.
# NOTE(review): the line that applies these offsets is not visible here;
# presumably it is added to the register digit to form the register number.
1056 my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
# opf field value for each mnemonic this encoder handles.
1058 my %visopf = ( "addxc" => 0x011,
1060 "umulxhi" => 0x016 );
# Plain-text form of the instruction; returned as-is whenever the instruction
# cannot be encoded. $ref and $opf are not declared with "my" in the visible
# lines.
1062 $ref = "$mnemonic\t$rs1,$rs2,$rd";
# Encode only mnemonics that have an entry in %visopf.
1064 if ($opf=$visopf{$mnemonic}) {
1065 foreach ($rs1,$rs2,$rd) {
# Any operand that is not a %g/%o/%l/%i register makes the sub fall back to
# the textual form.
1066 return $ref if (!/%([goli])([0-9])/);
# Emit the instruction as a raw .word: rd goes to bit 25, rs1 to bit 14, opf
# to bit 5 and rs2 to bit 0, OR-ed into the base pattern 0x81b00000 ("<<"
# binds tighter than "|"). The %s produces a trailing "!" assembler comment.
1070 return sprintf ".word\t0x%08x !%s",
1071 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
# Body of the FMA encoder (the enclosing sub header and closing lines are not
# visible in this excerpt).
# Arguments: instruction mnemonic, three source registers, destination
# register, all as text (e.g. "%f12").
1079 my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
# opf field value per mnemonic; only the first entry is visible here.
1081 my %fmaopf = ( "fmadds" => 0x1,
# Plain-text form of the instruction; returned as-is whenever the instruction
# cannot be encoded. $ref and $opf are not declared with "my" in the visible
# lines.
1086 $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";
# Encode only mnemonics that have an entry in %fmaopf.
1088 if ($opf=$fmaopf{$mnemonic}) {
1089 foreach ($rs1,$rs2,$rs3,$rd) {
# Any operand that is not of the form %fN (one or two digits) makes the sub
# fall back to the textual form.
1090 return $ref if (!/%f([0-9]{1,2})/);
# Fall back to the textual form for an odd register number.
# NOTE(review): the condition guarding this line, if any, is not visible here.
1093 return $ref if ($1&1);
1094 # re-encode for upper double register addressing
# Emit the instruction as a raw .word: rd goes to bit 25, rs1 to bit 14, rs3
# to bit 9, opf to bit 5 and rs2 to bit 0, OR-ed into the base pattern
# 0x81b80000. The %s produces a trailing "!" assembler comment.
1099 return sprintf ".word\t0x%08x !%s",
1100 0x81b80000|$rd<<25|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
# Post-process the generated text in $code one line at a time (the rest of
# the loop body, including its closing lines, is not visible in this excerpt).
1107 foreach (split("\n",$code)) {
# Replace every `...` span with the result of evaluating its contents as Perl.
1108 s/\`([^\`]*)\`/eval $1/ge;
# Match umulxhi, addxc, addxcc or addxccc with three integer-register
# operands and hand the mnemonic and operands to unvis3.
1110 s
/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
1111 &unvis3
($1,$2,$3,$4)
# Match fmadds or fmaddd with four %f-register operands and hand the mnemonic
# and operands to unfma.
1113 s
/\b(fmadd[sd])\s+(%f[0-9]+),\s*(%f[0-9]+),\s*(%f[0-9]+),\s*(%f[0-9]+)/
1114 &unfma
($1,$2,$3,$4,$5)