3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
# SPARCv9 VIS3 Montgomery multiplication procedure suitable for T3 and
13 # onward. There are three new instructions used here: umulxhi,
14 # addxc[cc] and initializing store. On T3 RSA private key operations
15 # are 1.54/1.87/2.11/2.26 times faster for 512/1024/2048/4096-bit key
16 # lengths. This is without dedicated squaring procedure. On T4
17 # corresponding coefficients are 1.47/2.10/2.80/2.90x, which is mostly
18 # for reference purposes, because T4 has dedicated Montgomery
19 # multiplication and squaring *instructions* that deliver even more.
# Select ABI-dependent parameters from the assembler flags passed on the
# command line.  Default to the 32-bit ABI; -m64 (gcc) or -xarch=v9 (Sun cc)
# selects the 64-bit ABI.  $bits must be initialized here, otherwise the
# comparison below reads an undefined value (and warns under "use warnings").
$bits=32;
for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
# $bias is the SPARCv9 stack bias (2047 in 64-bit mode, 0 in 32-bit mode);
# $frame is the minimum register-save area size for the respective ABI.
if ($bits==64) { $bias=2047; $frame=192; }
else { $bias=0; $frame=112; }
26 $code.=<<___
if ($bits==64);
27 .register
%g2,#scratch
28 .register
%g3,#scratch
31 .section
".text",#alloc,#execinstr
# Scratch-register allocation for the multiplication loops:
# the five globals %g1..%g5, then the outs %o0..%o5 and %o7
# (%o6 is the stack pointer and must be left alone).
($n0,$m0,$m1,$lo0,$hi0, $lo1,$hi1,$aj,$alo,$nj,$nlo,$tj) =
	((map { "%g$_" } 1..5), (map { "%o$_" } (0..5,7)));
# C-level arguments as seen on entry (before "save"), in the out registers:
#   bn_mul_mont_vis3(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
#                    const BN_ULONG *np, const BN_ULONG *n0, int num);
# The caller ensures that num is even.
($rp,$ap,$bp,$np,$n0p,$num) = map { "%o$_" } 0..5;
46 .globl bn_mul_mont_vis3
49 add
%sp, $bias, %g4 ! real top of stack
50 sll
$num, 2, $num ! size
in bytes
52 andn
%g5, 63, %g5 ! buffer size rounded up to
64 bytes
54 add
%g5, %g1, %g1 ! 3*buffer size
56 andn
%g1, 63, %g1 ! align at
64 byte
57 sub %g1, $frame, %g1 ! new top of stack
63 # +-------------------------------+<----- %sp
65 # +-------------------------------+<----- aligned at 64 bytes
67 # +-------------------------------+
70 # +-------------------------------+<----- aligned at 64 bytes
71 # | __int64 ap[1..0] | converted ap[]
72 # +-------------------------------+
73 # | __int64 np[1..0] | converted np[]
74 # +-------------------------------+
75 # | __int64 ap[3..2] |
78 # +-------------------------------+
# After the "save" instruction the incoming arguments are visible in the
# in registers; rebind the same symbolic names to %i0..%i5.
($rp,$ap,$bp,$np,$n0p,$num) = map { "%i$_" } 0..5;
# Temporaries, counters and pointers occupy the local registers %l0..%l7.
($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz,$anp) = map { "%l$_" } 0..7;
83 ld
[$n0p+0], $t0 ! pull n0
[0..1] value
84 add
%sp, $bias+$frame, $tp
87 ld
[$bp+0], $t2 ! m0
=bp
[0]
93 ld
[$ap+0], $t0 ! ap
[0]
98 ld
[$ap+8], $t2 ! ap
[1]
103 stxa
$aj, [$anp]0xe2 ! converted ap
[0]
105 mulx
$aj, $m0, $lo0 ! ap
[0]*bp
[0]
106 umulxhi
$aj, $m0, $hi0
108 ld
[$np+0], $t0 ! np
[0]
113 ld
[$np+8], $t2 ! np
[1]
118 stx
$nj, [$anp+8] ! converted np
[0]
120 mulx
$lo0, $n0, $m1 ! "tp[0]"*n0
121 stx
$aj, [$anp+16] ! converted ap
[1]
123 mulx
$aj, $m0, $alo ! ap
[1]*bp
[0]
124 umulxhi
$aj, $m0, $aj ! ahi
=aj
126 mulx
$nj, $m1, $lo1 ! np
[0]*m1
127 umulxhi
$nj, $m1, $hi1
131 stx
$nj, [$anp+24] ! converted np
[1]
134 addcc
$lo0, $lo1, $lo1
135 addxc
%g0, $hi1, $hi1
137 mulx
$nj, $m1, $nlo ! np
[1]*m1
138 umulxhi
$nj, $m1, $nj ! nhi
=nj
141 sub $num, 24, $cnt ! cnt
=num
-3
145 ld
[$ap+0], $t0 ! ap
[j
]
146 addcc
$alo, $hi0, $lo0
153 stxa
$aj, [$anp]0xe2 ! converted ap
[j
]
155 ld
[$np+0], $t2 ! np
[j
]
156 addcc
$nlo, $hi1, $lo1
158 addxc
$nj, %g0, $hi1 ! nhi
=nj
162 mulx
$aj, $m0, $alo ! ap
[j
]*bp
[0]
164 umulxhi
$aj, $m0, $aj ! ahi
=aj
165 stx
$nj, [$anp+8] ! converted np
[j
]
166 add
$anp, 16, $anp ! anp
++
168 mulx
$nj, $m1, $nlo ! np
[j
]*m1
169 addcc
$lo0, $lo1, $lo1 ! np
[j
]*m1
+ap
[j
]*bp
[0]
170 umulxhi
$nj, $m1, $nj ! nhi
=nj
171 addxc
%g0, $hi1, $hi1
172 stxa
$lo1, [$tp]0xe2 ! tp
[j
-1]
173 add
$tp, 8, $tp ! tp
++
176 sub $cnt, 8, $cnt ! j
--
178 addcc
$alo, $hi0, $lo0
179 addxc
$aj, %g0, $hi0 ! ahi
=aj
181 addcc
$nlo, $hi1, $lo1
183 addcc
$lo0, $lo1, $lo1 ! np
[j
]*m1
+ap
[j
]*bp
[0]
184 addxc
%g0, $hi1, $hi1
185 stxa
$lo1, [$tp]0xe2 ! tp
[j
-1]
188 addcc
$hi0, $hi1, $hi1
189 addxc
%g0, %g0, $ovf ! upmost overflow bit
194 sub $num, 16, $i ! i
=num
-2
198 ld
[$bp+0], $t2 ! m0
=bp
[i
]
201 sub $anp, $num, $anp ! rewind
207 ldx
[$anp+0], $aj ! ap
[0]
209 ldx
[$anp+8], $nj ! np
[0]
211 mulx
$aj, $m0, $lo0 ! ap
[0]*bp
[i
]
212 ldx
[$tp], $tj ! tp
[0]
213 umulxhi
$aj, $m0, $hi0
214 ldx
[$anp+16], $aj ! ap
[1]
215 addcc
$lo0, $tj, $lo0 ! ap
[0]*bp
[i
]+tp
[0]
216 mulx
$aj, $m0, $alo ! ap
[1]*bp
[i
]
217 addxc
%g0, $hi0, $hi0
218 mulx
$lo0, $n0, $m1 ! tp
[0]*n0
219 umulxhi
$aj, $m0, $aj ! ahi
=aj
220 mulx
$nj, $m1, $lo1 ! np
[0]*m1
221 umulxhi
$nj, $m1, $hi1
222 ldx
[$anp+24], $nj ! np
[1]
224 addcc
$lo1, $lo0, $lo1
225 mulx
$nj, $m1, $nlo ! np
[1]*m1
226 addxc
%g0, $hi1, $hi1
227 umulxhi
$nj, $m1, $nj ! nhi
=nj
230 sub $num, 24, $cnt ! cnt
=num
-3
233 addcc
$alo, $hi0, $lo0
234 ldx
[$tp+8], $tj ! tp
[j
]
235 addxc
$aj, %g0, $hi0 ! ahi
=aj
236 ldx
[$anp+0], $aj ! ap
[j
]
237 addcc
$nlo, $hi1, $lo1
238 mulx
$aj, $m0, $alo ! ap
[j
]*bp
[i
]
239 addxc
$nj, %g0, $hi1 ! nhi
=nj
240 ldx
[$anp+8], $nj ! np
[j
]
242 umulxhi
$aj, $m0, $aj ! ahi
=aj
243 addcc
$lo0, $tj, $lo0 ! ap
[j
]*bp
[i
]+tp
[j
]
244 mulx
$nj, $m1, $nlo ! np
[j
]*m1
245 addxc
%g0, $hi0, $hi0
246 umulxhi
$nj, $m1, $nj ! nhi
=nj
247 addcc
$lo1, $lo0, $lo1 ! np
[j
]*m1
+ap
[j
]*bp
[i
]+tp
[j
]
248 addxc
%g0, $hi1, $hi1
249 stx
$lo1, [$tp] ! tp
[j
-1]
251 brnz
,pt
$cnt, .Linner
254 ldx
[$tp+8], $tj ! tp
[j
]
255 addcc
$alo, $hi0, $lo0
256 addxc
$aj, %g0, $hi0 ! ahi
=aj
257 addcc
$lo0, $tj, $lo0 ! ap
[j
]*bp
[i
]+tp
[j
]
258 addxc
%g0, $hi0, $hi0
260 addcc
$nlo, $hi1, $lo1
261 addxc
$nj, %g0, $hi1 ! nhi
=nj
262 addcc
$lo1, $lo0, $lo1 ! np
[j
]*m1
+ap
[j
]*bp
[i
]+tp
[j
]
263 addxc
%g0, $hi1, $hi1
264 stx
$lo1, [$tp] ! tp
[j
-1]
266 subcc
%g0, $ovf, %g0 ! move upmost overflow to CCR
.xcc
267 addxccc
$hi1, $hi0, $hi1
275 sub $anp, $num, $anp ! rewind
279 subcc
$num, 8, $cnt ! cnt
=num
-1 and clear CCR
.xcc
287 subccc
$tj, $nj, $t2 ! tp
[j
]-np
[j
]
292 st
$t2, [$rp-4] ! reverse order
297 sub $anp, $num, $anp ! rewind
302 subc
$ovf, %g0, $ovf ! handle upmost overflow bit
305 or $np, $ap, $ap ! ap
=borrow?tp
:rp
310 .Lcopy
: ! copy
or in-place refresh
316 stx
%g0, [$anp] ! zap
319 st
$t3, [$rp+0] ! flip order
328 .type bn_mul_mont_vis3
, #function
329 .size bn_mul_mont_vis3
, .-bn_mul_mont_vis3
330 .asciz
"Montgomery Multiplication for SPARCv9 VIS3, CRYPTOGAMS by <appro\@openssl.org>"
334 # Purpose of these subroutines is to explicitly encode VIS instructions,
335 # so that one can compile the module without having to specify VIS
# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
337 # Idea is to reserve for option to produce "universal" binary and let
338 # programmer detect if current CPU is VIS capable at run-time.
340 my ($mnemonic,$rs1,$rs2,$rd)=@_;
341 my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
343 my %visopf = ( "addxc" => 0x011,
345 "umulxhi" => 0x016 );
347 $ref = "$mnemonic\t$rs1,$rs2,$rd";
349 if ($opf=$visopf{$mnemonic}) {
350 foreach ($rs1,$rs2,$rd) {
351 return $ref if (!/%([goli])([0-9])/);
355 return sprintf ".word\t0x%08x !%s",
356 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
363 foreach (split("\n",$code)) {
364 s/\`([^\`]*)\`/eval $1/ge;
366 s
/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/