]> git.ipfire.org Git - thirdparty/openssl.git/blame - crypto/bn/asm/sparcv9a-mont.pl
Unified - adapt the generation of bignum assembler to use GENERATE
[thirdparty/openssl.git] / crypto / bn / asm / sparcv9a-mont.pl
CommitLineData
bcb43bb3
AP
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
2e21922e
AP
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
bcb43bb3
AP
8# ====================================================================
9
aa2be094
AP
10# October 2005
11#
bcb43bb3
AP
12# "Teaser" Montgomery multiplication module for UltraSPARC. Why FPU?
13# Because unlike integer multiplier, which simply stalls whole CPU,
14# FPU is fully pipelined and can effectively emit 48 bit partial
15# product every cycle. Why not blended SPARC v9? One can argue that
16# making this module dependent on UltraSPARC VIS extension limits its
a4d729f3
AP
17# binary compatibility. Well yes, it does exclude SPARC64 prior-V(!)
18# implementations from compatibility matrix. But the rest, whole Sun
19# UltraSPARC family and brand new Fujitsu's SPARC64 V, all support
20# VIS extension instructions used in this module. This is considered
73b979e6
AP
21# good enough to not care about HAL SPARC64 users [if any] who have
22# integer-only pure SPARCv9 module to "fall down" to.
bcb43bb3
AP
23
24# USI&II cores currently exhibit uniform 2x improvement [over pre-
25# bn_mul_mont codebase] for all key lengths and benchmarks. On USIII
26# performance improves few percents for shorter keys and worsens few
aa2be094 27# percents for longer keys. This is because USIII integer multiplier
bcb43bb3
AP
28# is >3x faster than USI&II one, which is harder to match [but see
29# TODO list below]. It should also be noted that SPARC64 V features
30# out-of-order execution, which *might* mean that integer multiplier
a4d729f3
AP
31# is pipelined, which in turn *might* be impossible to match... On
32# additional note, SPARC64 V implements FP Multiply-Add instruction,
33# which is perfectly usable in this context... In other words, as far
73b979e6 34# as Fujitsu SPARC64 V goes, talk to the author:-)
aa2be094 35
a00e414f
AP
36# The implementation implies following "non-natural" limitations on
37# input arguments:
aa2be094
AP
38# - num may not be less than 4;
39# - num has to be even;
aa2be094
AP
40# Failure to meet either condition has no fatal effects, simply
41# doesn't give any performance gain.
42
bcb43bb3 43# TODO:
bcb43bb3
AP
44# - modulo-schedule inner loop for better performance (on in-order
45# execution core such as UltraSPARC this shall result in further
46# noticeable(!) improvement);
47# - dedicated squaring procedure[?];
48
2e21922e
AP
49######################################################################
50# November 2006
51#
52# Modulo-scheduled inner loops allow to interleave floating point and
53# integer instructions and minimize Read-After-Write penalties. This
54# results in *further* 20-50% performance improvement [depending on
55# key length, more for longer keys] on USI&II cores and 30-80% - on
56# USIII&IV.
57
6bd7a4d9
RL
# The last command-line argument names the output file.  Everything the
# script prints to STDOUT from here on is redirected into that file, so a
# failure to open it must abort the run instead of silently discarding
# the generated assembly.
$output = pop;
open STDOUT,">$output" or die "can't open $output: $!";
60
# Name of the emitted routine, and the ABI width: 32-bit unless one of the
# remaining command-line arguments contains -m64 or -xarch=v9.
a00e414f 61$fname="bn_mul_mont_fpu";
bcb43bb3 62$bits=32;
3b4a0225 63for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
bcb43bb3
AP
64
# Stack layout parameters interpolated into the assembly below:
#   $bias   - offset added to %sp in every stack address computation
#             (the assembly calls %sp+$bias the "real top of stack");
#   $frame  - size of the register-save/argument frame for the chosen ABI;
#   $locals - scratch area directly above the frame; the assembly addresses
#             it as %sp+$bias+$frame+0 .. +48.
65if ($bits==64) {
66 $bias=2047;
67 $frame=192;
68} else {
69 $bias=0;
70 $frame=128; # 96 rounded up to largest known cache-line
71}
72$locals=64;
73
74# In order to provide for 32-/64-bit ABI duality, I keep integers wider
75# than 32 bit in %g1-%g4 and %o0-%o5. %l0-%l7 and %i0-%i5 are used
76# exclusively for pointers, indexes and other small values...
77# int bn_mul_mont(
78$rp="%i0"; # BN_ULONG *rp,
79$ap="%i1"; # const BN_ULONG *ap,
80$bp="%i2"; # const BN_ULONG *bp,
81$np="%i3"; # const BN_ULONG *np,
4d524040 82$n0="%i4"; # const BN_ULONG *n0,
bcb43bb3
AP
83$num="%i5"; # int num);
84
aa2be094 85$tp="%l0"; # t[num]
bcb43bb3
AP
86$ap_l="%l1"; # a[num],n[num] are smashed to 32-bit words and saved
87$ap_h="%l2"; # to these four vectors as double-precision FP values.
88$np_l="%l3"; # This way a bunch of fxtods are eliminated in second
89$np_h="%l4"; # loop and L1-cache aliasing is minimized...
90$i="%l5";
91$j="%l6";
92$mask="%l7"; # 16-bit mask, 0xffff
93
aa2be094
AP
94$n0="%g4"; # reassigned(!) to "64-bit" register
95$carry="%i4"; # %i4 reused(!) for a carry bit
bcb43bb3
AP
96
97# FP register naming chart
98#
99# ..HILO
100# dcba
101# --------
102# LOa
103# LOb
104# LOc
105# LOd
106# HIa
107# HIb
108# HIc
109# HId
110# ..a
111# ..b
112$ba="%f0"; $bb="%f2"; $bc="%f4"; $bd="%f6";
113$na="%f8"; $nb="%f10"; $nc="%f12"; $nd="%f14";
114$alo="%f16"; $alo_="%f17"; $ahi="%f18"; $ahi_="%f19";
115$nlo="%f20"; $nlo_="%f21"; $nhi="%f22"; $nhi_="%f23";
116
117$dota="%f24"; $dotb="%f26";
118
119$aloa="%f32"; $alob="%f34"; $aloc="%f36"; $alod="%f38";
120$ahia="%f40"; $ahib="%f42"; $ahic="%f44"; $ahid="%f46";
121$nloa="%f48"; $nlob="%f50"; $nloc="%f52"; $nlod="%f54";
122$nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62";
123
124$ASI_FL16_P=0xD2; # magic ASI value to engage 16-bit FP load
125
126$code=<<___;
bcb43bb3
AP
127.section ".text",#alloc,#execinstr
128
! Entry point and prologue. Rejects unsupported lengths by returning 0,
! then carves five vectors of num*8 bytes each out of the stack (tp plus
! the four double-precision copies of a and n), points every vector and
! every input pointer at its END so that negative indexes counting up to
! zero can be used, and switches the ASI register to the 16-bit FP load
! space after saving the old value.
129.global $fname
130.align 32
131$fname:
aa2be094 132 save %sp,-$frame-$locals,%sp ! new window, frame plus scratch area
6df8c74d 133
aa2be094
AP
134 cmp $num,4 ! lengths below 4 are not handled here
135 bl,a,pn %icc,.Lret
136 clr %i0 ! annulled delay slot: return 0 only if the branch is taken
137 andcc $num,1,%g0 ! $num has to be even...
138 bnz,a,pn %icc,.Lret
139 clr %i0 ! signal "unsupported input value"
760e3535 140
aa2be094 141 srl $num,1,$num ! halve: the loops consume pairs of 32-bit words
760e3535 142 sethi %hi(0xffff),$mask ! build the 16-bit mask, upper part
aa2be094 143 ld [%i4+0],$n0 ! $n0 reassigned, remember?
760e3535 144 or $mask,%lo(0xffff),$mask ! mask = 0xffff
aa2be094
AP
145 ld [%i4+4],%o0 ! second 32-bit word of n0
146 sllx %o0,32,%o0
147 or %o0,$n0,$n0 ! $n0=n0[1].n0[0]
6df8c74d 148
aa2be094 149 sll $num,3,$num ! num*=8
bcb43bb3
AP
150
151 add %sp,$bias,%o0 ! real top of stack
152 sll $num,2,%o1
153 add %o1,$num,%o1 ! %o1=num*5
154 sub %o0,%o1,%o0 ! make room for the five vectors
bcb43bb3 155 and %o0,-2048,%o0 ! optimize TLB utilization
aa2be094 156 sub %o0,$bias,%sp ! alloca(5*num*8)
bcb43bb3 157
aa2be094 158 rd %asi,%o7 ! save %asi
bcb43bb3
AP
159 add %sp,$bias+$frame+$locals,$tp ! tp = start of the first vector
160 add $tp,$num,$ap_l
aa2be094 161 add $ap_l,$num,$ap_l ! [an]p_[lh] point at the vectors' ends !
bcb43bb3
AP
162 add $ap_l,$num,$ap_h ! each following vector ends num bytes later
163 add $ap_h,$num,$np_l
164 add $np_l,$num,$np_h
165
166 wr %g0,$ASI_FL16_P,%asi ! setup %asi for 16-bit FP loads
167
168 add $rp,$num,$rp ! readjust input pointers to point
169 add $ap,$num,$ap ! at the ends too...
170 add $bp,$num,$bp
171 add $np,$num,$np
172
aa2be094 173 stx %o7,[%sp+$bias+$frame+48] ! save %asi
bcb43bb3 174\f
6df8c74d
AP
175 sub %g0,$num,$i ! i=-num
176 sub %g0,$num,$j ! j=-num
bcb43bb3
AP
177
178 add $ap,$j,%o3
179 add $bp,$i,%o4
6df8c74d 180
87d3af64
AP
181 ld [%o3+4],%g1 ! ap[0]
182 ld [%o3+0],%o0
183 ld [%o4+4],%g5 ! bp[0]
184 sllx %g1,32,%g1
185 ld [%o4+0],%o1
186 sllx %g5,32,%g5
6df8c74d
AP
187 or %g1,%o0,%o0
188 or %g5,%o1,%o1
189
aa2be094 190 add $np,$j,%o5
bcb43bb3
AP
191
192 mulx %o1,%o0,%o0 ! ap[0]*bp[0]
193 mulx $n0,%o0,%o0 ! ap[0]*bp[0]*n0
aa2be094 194 stx %o0,[%sp+$bias+$frame+0]
bcb43bb3 195
6df8c74d 196 ld [%o3+0],$alo_ ! load a[j] as pair of 32-bit words
aa2be094 197 fzeros $alo
6df8c74d 198 ld [%o3+4],$ahi_
aa2be094 199 fzeros $ahi
6df8c74d 200 ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
aa2be094 201 fzeros $nlo
6df8c74d 202 ld [%o5+4],$nhi_
aa2be094 203 fzeros $nhi
bcb43bb3
AP
204
205 ! transfer b[i] to FPU as 4x16-bit values
6df8c74d 206 ldda [%o4+2]%asi,$ba
bcb43bb3 207 fxtod $alo,$alo
6df8c74d 208 ldda [%o4+0]%asi,$bb
bcb43bb3 209 fxtod $ahi,$ahi
6df8c74d 210 ldda [%o4+6]%asi,$bc
bcb43bb3 211 fxtod $nlo,$nlo
6df8c74d 212 ldda [%o4+4]%asi,$bd
bcb43bb3
AP
213 fxtod $nhi,$nhi
214
215 ! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values
aa2be094 216 ldda [%sp+$bias+$frame+6]%asi,$na
bcb43bb3 217 fxtod $ba,$ba
aa2be094 218 ldda [%sp+$bias+$frame+4]%asi,$nb
bcb43bb3 219 fxtod $bb,$bb
aa2be094 220 ldda [%sp+$bias+$frame+2]%asi,$nc
bcb43bb3 221 fxtod $bc,$bc
aa2be094 222 ldda [%sp+$bias+$frame+0]%asi,$nd
bcb43bb3
AP
223 fxtod $bd,$bd
224
225 std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
226 fxtod $na,$na
227 std $ahi,[$ap_h+$j]
228 fxtod $nb,$nb
229 std $nlo,[$np_l+$j] ! save smashed np[j] in double format
230 fxtod $nc,$nc
231 std $nhi,[$np_h+$j]
232 fxtod $nd,$nd
233
aa2be094
AP
234 fmuld $alo,$ba,$aloa
235 fmuld $nlo,$na,$nloa
236 fmuld $alo,$bb,$alob
237 fmuld $nlo,$nb,$nlob
238 fmuld $alo,$bc,$aloc
aa2be094 239 faddd $aloa,$nloa,$nloa
6df8c74d 240 fmuld $nlo,$nc,$nloc
aa2be094 241 fmuld $alo,$bd,$alod
aa2be094 242 faddd $alob,$nlob,$nlob
6df8c74d 243 fmuld $nlo,$nd,$nlod
aa2be094 244 fmuld $ahi,$ba,$ahia
aa2be094 245 faddd $aloc,$nloc,$nloc
6df8c74d 246 fmuld $nhi,$na,$nhia
aa2be094 247 fmuld $ahi,$bb,$ahib
aa2be094 248 faddd $alod,$nlod,$nlod
6df8c74d 249 fmuld $nhi,$nb,$nhib
aa2be094 250 fmuld $ahi,$bc,$ahic
aa2be094 251 faddd $ahia,$nhia,$nhia
6df8c74d 252 fmuld $nhi,$nc,$nhic
aa2be094 253 fmuld $ahi,$bd,$ahid
6df8c74d 254 faddd $ahib,$nhib,$nhib
aa2be094 255 fmuld $nhi,$nd,$nhid
bcb43bb3 256
bcb43bb3
AP
257 faddd $ahic,$nhic,$dota ! $nhic
258 faddd $ahid,$nhid,$dotb ! $nhid
259
260 faddd $nloc,$nhia,$nloc
261 faddd $nlod,$nhib,$nlod
262
263 fdtox $nloa,$nloa
264 fdtox $nlob,$nlob
265 fdtox $nloc,$nloc
266 fdtox $nlod,$nlod
267
268 std $nloa,[%sp+$bias+$frame+0]
2e21922e 269 add $j,8,$j
bcb43bb3 270 std $nlob,[%sp+$bias+$frame+8]
2e21922e 271 add $ap,$j,%o4
bcb43bb3 272 std $nloc,[%sp+$bias+$frame+16]
2e21922e 273 add $np,$j,%o5
bcb43bb3 274 std $nlod,[%sp+$bias+$frame+24]
bcb43bb3 275\f
1c3d2b94 276 ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
aa2be094 277 fzeros $alo
1c3d2b94 278 ld [%o4+4],$ahi_
aa2be094 279 fzeros $ahi
1c3d2b94 280 ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
aa2be094 281 fzeros $nlo
1c3d2b94 282 ld [%o5+4],$nhi_
aa2be094 283 fzeros $nhi
bcb43bb3
AP
284
285 fxtod $alo,$alo
286 fxtod $ahi,$ahi
287 fxtod $nlo,$nlo
288 fxtod $nhi,$nhi
289
2e21922e 290 ldx [%sp+$bias+$frame+0],%o0
aa2be094 291 fmuld $alo,$ba,$aloa
2e21922e 292 ldx [%sp+$bias+$frame+8],%o1
aa2be094 293 fmuld $nlo,$na,$nloa
2e21922e 294 ldx [%sp+$bias+$frame+16],%o2
aa2be094 295 fmuld $alo,$bb,$alob
2e21922e 296 ldx [%sp+$bias+$frame+24],%o3
aa2be094 297 fmuld $nlo,$nb,$nlob
2e21922e
AP
298
299 srlx %o0,16,%o7
300 std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
aa2be094 301 fmuld $alo,$bc,$aloc
2e21922e
AP
302 add %o7,%o1,%o1
303 std $ahi,[$ap_h+$j]
304 faddd $aloa,$nloa,$nloa
6df8c74d 305 fmuld $nlo,$nc,$nloc
2e21922e
AP
306 srlx %o1,16,%o7
307 std $nlo,[$np_l+$j] ! save smashed np[j] in double format
aa2be094 308 fmuld $alo,$bd,$alod
2e21922e
AP
309 add %o7,%o2,%o2
310 std $nhi,[$np_h+$j]
311 faddd $alob,$nlob,$nlob
6df8c74d 312 fmuld $nlo,$nd,$nlod
2e21922e 313 srlx %o2,16,%o7
aa2be094 314 fmuld $ahi,$ba,$ahia
2e21922e
AP
315 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
316 faddd $aloc,$nloc,$nloc
6df8c74d 317 fmuld $nhi,$na,$nhia
2e21922e
AP
318 !and %o0,$mask,%o0
319 !and %o1,$mask,%o1
320 !and %o2,$mask,%o2
321 !sllx %o1,16,%o1
322 !sllx %o2,32,%o2
323 !sllx %o3,48,%o7
324 !or %o1,%o0,%o0
325 !or %o2,%o0,%o0
326 !or %o7,%o0,%o0 ! 64-bit result
327 srlx %o3,16,%g1 ! 34-bit carry
aa2be094 328 fmuld $ahi,$bb,$ahib
2e21922e 329
aa2be094 330 faddd $alod,$nlod,$nlod
6df8c74d 331 fmuld $nhi,$nb,$nhib
aa2be094 332 fmuld $ahi,$bc,$ahic
aa2be094 333 faddd $ahia,$nhia,$nhia
6df8c74d 334 fmuld $nhi,$nc,$nhic
aa2be094 335 fmuld $ahi,$bd,$ahid
aa2be094 336 faddd $ahib,$nhib,$nhib
6df8c74d 337 fmuld $nhi,$nd,$nhid
bcb43bb3
AP
338
339 faddd $dota,$nloa,$nloa
340 faddd $dotb,$nlob,$nlob
341 faddd $ahic,$nhic,$dota ! $nhic
342 faddd $ahid,$nhid,$dotb ! $nhid
343
344 faddd $nloc,$nhia,$nloc
345 faddd $nlod,$nhib,$nlod
346
347 fdtox $nloa,$nloa
348 fdtox $nlob,$nlob
349 fdtox $nloc,$nloc
350 fdtox $nlod,$nlod
351
352 std $nloa,[%sp+$bias+$frame+0]
353 std $nlob,[%sp+$bias+$frame+8]
2e21922e 354 addcc $j,8,$j
bcb43bb3 355 std $nloc,[%sp+$bias+$frame+16]
2e21922e 356 bz,pn %icc,.L1stskip
bcb43bb3 357 std $nlod,[%sp+$bias+$frame+24]
1c3d2b94 358\f
23296942 359.align 32 ! incidentally already aligned !
1c3d2b94 360.L1st:
1c3d2b94
AP
361 add $ap,$j,%o4
362 add $np,$j,%o5
363 ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
364 fzeros $alo
365 ld [%o4+4],$ahi_
366 fzeros $ahi
367 ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
368 fzeros $nlo
369 ld [%o5+4],$nhi_
370 fzeros $nhi
371
372 fxtod $alo,$alo
373 fxtod $ahi,$ahi
374 fxtod $nlo,$nlo
375 fxtod $nhi,$nhi
376
2e21922e 377 ldx [%sp+$bias+$frame+0],%o0
1c3d2b94 378 fmuld $alo,$ba,$aloa
2e21922e 379 ldx [%sp+$bias+$frame+8],%o1
1c3d2b94 380 fmuld $nlo,$na,$nloa
2e21922e 381 ldx [%sp+$bias+$frame+16],%o2
1c3d2b94 382 fmuld $alo,$bb,$alob
2e21922e 383 ldx [%sp+$bias+$frame+24],%o3
1c3d2b94 384 fmuld $nlo,$nb,$nlob
2e21922e
AP
385
386 srlx %o0,16,%o7
387 std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
1c3d2b94 388 fmuld $alo,$bc,$aloc
2e21922e
AP
389 add %o7,%o1,%o1
390 std $ahi,[$ap_h+$j]
391 faddd $aloa,$nloa,$nloa
1c3d2b94 392 fmuld $nlo,$nc,$nloc
2e21922e
AP
393 srlx %o1,16,%o7
394 std $nlo,[$np_l+$j] ! save smashed np[j] in double format
1c3d2b94 395 fmuld $alo,$bd,$alod
2e21922e
AP
396 add %o7,%o2,%o2
397 std $nhi,[$np_h+$j]
398 faddd $alob,$nlob,$nlob
1c3d2b94 399 fmuld $nlo,$nd,$nlod
2e21922e 400 srlx %o2,16,%o7
1c3d2b94 401 fmuld $ahi,$ba,$ahia
2e21922e
AP
402 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
403 and %o0,$mask,%o0
404 faddd $aloc,$nloc,$nloc
1c3d2b94 405 fmuld $nhi,$na,$nhia
2e21922e
AP
406 and %o1,$mask,%o1
407 and %o2,$mask,%o2
1c3d2b94 408 fmuld $ahi,$bb,$ahib
2e21922e
AP
409 sllx %o1,16,%o1
410 faddd $alod,$nlod,$nlod
1c3d2b94 411 fmuld $nhi,$nb,$nhib
2e21922e 412 sllx %o2,32,%o2
1c3d2b94 413 fmuld $ahi,$bc,$ahic
2e21922e
AP
414 sllx %o3,48,%o7
415 or %o1,%o0,%o0
416 faddd $ahia,$nhia,$nhia
1c3d2b94 417 fmuld $nhi,$nc,$nhic
2e21922e 418 or %o2,%o0,%o0
1c3d2b94 419 fmuld $ahi,$bd,$ahid
2e21922e
AP
420 or %o7,%o0,%o0 ! 64-bit result
421 faddd $ahib,$nhib,$nhib
1c3d2b94 422 fmuld $nhi,$nd,$nhid
2e21922e
AP
423 addcc %g1,%o0,%o0
424 faddd $dota,$nloa,$nloa
425 srlx %o3,16,%g1 ! 34-bit carry
426 faddd $dotb,$nlob,$nlob
427 bcs,a %xcc,.+8
428 add %g1,1,%g1
429
430 stx %o0,[$tp] ! tp[j-1]=
1c3d2b94 431
1c3d2b94
AP
432 faddd $ahic,$nhic,$dota ! $nhic
433 faddd $ahid,$nhid,$dotb ! $nhid
434
435 faddd $nloc,$nhia,$nloc
436 faddd $nlod,$nhib,$nlod
437
438 fdtox $nloa,$nloa
439 fdtox $nlob,$nlob
440 fdtox $nloc,$nloc
441 fdtox $nlod,$nlod
442
443 std $nloa,[%sp+$bias+$frame+0]
444 std $nlob,[%sp+$bias+$frame+8]
445 std $nloc,[%sp+$bias+$frame+16]
446 std $nlod,[%sp+$bias+$frame+24]
447
aa2be094
AP
448 addcc $j,8,$j
449 bnz,pt %icc,.L1st
bcb43bb3 450 add $tp,8,$tp
1c3d2b94
AP
451\f
452.L1stskip:
ebae8092
AP
453 fdtox $dota,$dota
454 fdtox $dotb,$dotb
455
1c3d2b94
AP
456 ldx [%sp+$bias+$frame+0],%o0
457 ldx [%sp+$bias+$frame+8],%o1
458 ldx [%sp+$bias+$frame+16],%o2
459 ldx [%sp+$bias+$frame+24],%o3
460
461 srlx %o0,16,%o7
ebae8092 462 std $dota,[%sp+$bias+$frame+32]
1c3d2b94 463 add %o7,%o1,%o1
ebae8092 464 std $dotb,[%sp+$bias+$frame+40]
1c3d2b94
AP
465 srlx %o1,16,%o7
466 add %o7,%o2,%o2
467 srlx %o2,16,%o7
468 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
469 and %o0,$mask,%o0
470 and %o1,$mask,%o1
471 and %o2,$mask,%o2
472 sllx %o1,16,%o1
473 sllx %o2,32,%o2
474 sllx %o3,48,%o7
475 or %o1,%o0,%o0
476 or %o2,%o0,%o0
477 or %o7,%o0,%o0 ! 64-bit result
ebae8092 478 ldx [%sp+$bias+$frame+32],%o4
1c3d2b94 479 addcc %g1,%o0,%o0
ebae8092 480 ldx [%sp+$bias+$frame+40],%o5
1c3d2b94
AP
481 srlx %o3,16,%g1 ! 34-bit carry
482 bcs,a %xcc,.+8
483 add %g1,1,%g1
484
485 stx %o0,[$tp] ! tp[j-1]=
486 add $tp,8,$tp
bcb43bb3 487
ebae8092
AP
488 srlx %o4,16,%o7
489 add %o7,%o5,%o5
490 and %o4,$mask,%o4
491 sllx %o5,16,%o7
492 or %o7,%o4,%o4
493 addcc %g1,%o4,%o4
494 srlx %o5,48,%g1
bcb43bb3
AP
495 bcs,a %xcc,.+8
496 add %g1,1,%g1
497
498 mov %g1,$carry
ebae8092 499 stx %o4,[$tp] ! tp[num-1]=
bcb43bb3
AP
500\f
501 ba .Louter
502 add $i,8,$i
503.align 32
504.Louter:
6df8c74d 505 sub %g0,$num,$j ! j=-num
bcb43bb3
AP
506 add %sp,$bias+$frame+$locals,$tp
507
87d3af64 508 add $ap,$j,%o3
bcb43bb3 509 add $bp,$i,%o4
6df8c74d 510
87d3af64
AP
511 ld [%o3+4],%g1 ! ap[0]
512 ld [%o3+0],%o0
513 ld [%o4+4],%g5 ! bp[i]
514 sllx %g1,32,%g1
515 ld [%o4+0],%o1
516 sllx %g5,32,%g5
6df8c74d
AP
517 or %g1,%o0,%o0
518 or %g5,%o1,%o1
519
bcb43bb3
AP
520 ldx [$tp],%o2 ! tp[0]
521 mulx %o1,%o0,%o0
522 addcc %o2,%o0,%o0
523 mulx $n0,%o0,%o0 ! (ap[0]*bp[i]+t[0])*n0
aa2be094 524 stx %o0,[%sp+$bias+$frame+0]
bcb43bb3 525
bcb43bb3 526 ! transfer b[i] to FPU as 4x16-bit values
6df8c74d
AP
527 ldda [%o4+2]%asi,$ba
528 ldda [%o4+0]%asi,$bb
529 ldda [%o4+6]%asi,$bc
530 ldda [%o4+4]%asi,$bd
bcb43bb3
AP
531
532 ! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
aa2be094 533 ldda [%sp+$bias+$frame+6]%asi,$na
bcb43bb3 534 fxtod $ba,$ba
aa2be094 535 ldda [%sp+$bias+$frame+4]%asi,$nb
bcb43bb3 536 fxtod $bb,$bb
aa2be094 537 ldda [%sp+$bias+$frame+2]%asi,$nc
bcb43bb3 538 fxtod $bc,$bc
aa2be094 539 ldda [%sp+$bias+$frame+0]%asi,$nd
bcb43bb3
AP
540 fxtod $bd,$bd
541 ldd [$ap_l+$j],$alo ! load a[j] in double format
542 fxtod $na,$na
543 ldd [$ap_h+$j],$ahi
544 fxtod $nb,$nb
545 ldd [$np_l+$j],$nlo ! load n[j] in double format
546 fxtod $nc,$nc
547 ldd [$np_h+$j],$nhi
548 fxtod $nd,$nd
549
aa2be094
AP
550 fmuld $alo,$ba,$aloa
551 fmuld $nlo,$na,$nloa
552 fmuld $alo,$bb,$alob
553 fmuld $nlo,$nb,$nlob
554 fmuld $alo,$bc,$aloc
aa2be094 555 faddd $aloa,$nloa,$nloa
6df8c74d 556 fmuld $nlo,$nc,$nloc
aa2be094 557 fmuld $alo,$bd,$alod
aa2be094 558 faddd $alob,$nlob,$nlob
6df8c74d 559 fmuld $nlo,$nd,$nlod
aa2be094 560 fmuld $ahi,$ba,$ahia
aa2be094 561 faddd $aloc,$nloc,$nloc
6df8c74d 562 fmuld $nhi,$na,$nhia
aa2be094 563 fmuld $ahi,$bb,$ahib
aa2be094 564 faddd $alod,$nlod,$nlod
6df8c74d 565 fmuld $nhi,$nb,$nhib
aa2be094 566 fmuld $ahi,$bc,$ahic
aa2be094 567 faddd $ahia,$nhia,$nhia
6df8c74d 568 fmuld $nhi,$nc,$nhic
aa2be094 569 fmuld $ahi,$bd,$ahid
6df8c74d 570 faddd $ahib,$nhib,$nhib
aa2be094 571 fmuld $nhi,$nd,$nhid
bcb43bb3 572
bcb43bb3
AP
573 faddd $ahic,$nhic,$dota ! $nhic
574 faddd $ahid,$nhid,$dotb ! $nhid
575
576 faddd $nloc,$nhia,$nloc
577 faddd $nlod,$nhib,$nlod
578
579 fdtox $nloa,$nloa
580 fdtox $nlob,$nlob
581 fdtox $nloc,$nloc
582 fdtox $nlod,$nlod
583
584 std $nloa,[%sp+$bias+$frame+0]
585 std $nlob,[%sp+$bias+$frame+8]
586 std $nloc,[%sp+$bias+$frame+16]
2e21922e 587 add $j,8,$j
bcb43bb3 588 std $nlod,[%sp+$bias+$frame+24]
2e21922e
AP
589\f
590 ldd [$ap_l+$j],$alo ! load a[j] in double format
591 ldd [$ap_h+$j],$ahi
592 ldd [$np_l+$j],$nlo ! load n[j] in double format
593 ldd [$np_h+$j],$nhi
594
595 fmuld $alo,$ba,$aloa
596 fmuld $nlo,$na,$nloa
597 fmuld $alo,$bb,$alob
598 fmuld $nlo,$nb,$nlob
599 fmuld $alo,$bc,$aloc
bcb43bb3 600 ldx [%sp+$bias+$frame+0],%o0
2e21922e
AP
601 faddd $aloa,$nloa,$nloa
602 fmuld $nlo,$nc,$nloc
bcb43bb3 603 ldx [%sp+$bias+$frame+8],%o1
2e21922e 604 fmuld $alo,$bd,$alod
bcb43bb3 605 ldx [%sp+$bias+$frame+16],%o2
2e21922e
AP
606 faddd $alob,$nlob,$nlob
607 fmuld $nlo,$nd,$nlod
bcb43bb3 608 ldx [%sp+$bias+$frame+24],%o3
2e21922e 609 fmuld $ahi,$ba,$ahia
bcb43bb3
AP
610
611 srlx %o0,16,%o7
2e21922e
AP
612 faddd $aloc,$nloc,$nloc
613 fmuld $nhi,$na,$nhia
bcb43bb3 614 add %o7,%o1,%o1
2e21922e 615 fmuld $ahi,$bb,$ahib
bcb43bb3 616 srlx %o1,16,%o7
2e21922e
AP
617 faddd $alod,$nlod,$nlod
618 fmuld $nhi,$nb,$nhib
bcb43bb3 619 add %o7,%o2,%o2
2e21922e 620 fmuld $ahi,$bc,$ahic
bcb43bb3 621 srlx %o2,16,%o7
2e21922e
AP
622 faddd $ahia,$nhia,$nhia
623 fmuld $nhi,$nc,$nhic
bcb43bb3
AP
624 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
625 ! why?
626 and %o0,$mask,%o0
2e21922e 627 fmuld $ahi,$bd,$ahid
bcb43bb3
AP
628 and %o1,$mask,%o1
629 and %o2,$mask,%o2
2e21922e
AP
630 faddd $ahib,$nhib,$nhib
631 fmuld $nhi,$nd,$nhid
bcb43bb3 632 sllx %o1,16,%o1
2e21922e 633 faddd $dota,$nloa,$nloa
bcb43bb3 634 sllx %o2,32,%o2
2e21922e 635 faddd $dotb,$nlob,$nlob
bcb43bb3
AP
636 sllx %o3,48,%o7
637 or %o1,%o0,%o0
2e21922e 638 faddd $ahic,$nhic,$dota ! $nhic
bcb43bb3 639 or %o2,%o0,%o0
2e21922e 640 faddd $ahid,$nhid,$dotb ! $nhid
bcb43bb3
AP
641 or %o7,%o0,%o0 ! 64-bit result
642 ldx [$tp],%o7
2e21922e 643 faddd $nloc,$nhia,$nloc
bcb43bb3
AP
644 addcc %o7,%o0,%o0
645 ! end-of-why?
2e21922e 646 faddd $nlod,$nhib,$nlod
bcb43bb3 647 srlx %o3,16,%g1 ! 34-bit carry
2e21922e 648 fdtox $nloa,$nloa
bcb43bb3
AP
649 bcs,a %xcc,.+8
650 add %g1,1,%g1
bcb43bb3 651
bcb43bb3
AP
652 fdtox $nlob,$nlob
653 fdtox $nloc,$nloc
654 fdtox $nlod,$nlod
655
656 std $nloa,[%sp+$bias+$frame+0]
657 std $nlob,[%sp+$bias+$frame+8]
2e21922e 658 addcc $j,8,$j
bcb43bb3 659 std $nloc,[%sp+$bias+$frame+16]
2e21922e 660 bz,pn %icc,.Linnerskip
bcb43bb3 661 std $nlod,[%sp+$bias+$frame+24]
1c3d2b94 662\f
ebae8092
AP
663 ba .Linner
664 nop
665.align 32
1c3d2b94 666.Linner:
2e21922e
AP
667 ldd [$ap_l+$j],$alo ! load a[j] in double format
668 ldd [$ap_h+$j],$ahi
669 ldd [$np_l+$j],$nlo ! load n[j] in double format
670 ldd [$np_h+$j],$nhi
671
672 fmuld $alo,$ba,$aloa
673 fmuld $nlo,$na,$nloa
674 fmuld $alo,$bb,$alob
675 fmuld $nlo,$nb,$nlob
676 fmuld $alo,$bc,$aloc
bcb43bb3 677 ldx [%sp+$bias+$frame+0],%o0
2e21922e
AP
678 faddd $aloa,$nloa,$nloa
679 fmuld $nlo,$nc,$nloc
bcb43bb3 680 ldx [%sp+$bias+$frame+8],%o1
2e21922e 681 fmuld $alo,$bd,$alod
bcb43bb3 682 ldx [%sp+$bias+$frame+16],%o2
2e21922e
AP
683 faddd $alob,$nlob,$nlob
684 fmuld $nlo,$nd,$nlod
bcb43bb3 685 ldx [%sp+$bias+$frame+24],%o3
2e21922e 686 fmuld $ahi,$ba,$ahia
bcb43bb3
AP
687
688 srlx %o0,16,%o7
2e21922e
AP
689 faddd $aloc,$nloc,$nloc
690 fmuld $nhi,$na,$nhia
bcb43bb3 691 add %o7,%o1,%o1
2e21922e 692 fmuld $ahi,$bb,$ahib
bcb43bb3 693 srlx %o1,16,%o7
2e21922e
AP
694 faddd $alod,$nlod,$nlod
695 fmuld $nhi,$nb,$nhib
bcb43bb3 696 add %o7,%o2,%o2
2e21922e 697 fmuld $ahi,$bc,$ahic
bcb43bb3 698 srlx %o2,16,%o7
2e21922e
AP
699 faddd $ahia,$nhia,$nhia
700 fmuld $nhi,$nc,$nhic
bcb43bb3
AP
701 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
702 and %o0,$mask,%o0
2e21922e 703 fmuld $ahi,$bd,$ahid
bcb43bb3
AP
704 and %o1,$mask,%o1
705 and %o2,$mask,%o2
2e21922e
AP
706 faddd $ahib,$nhib,$nhib
707 fmuld $nhi,$nd,$nhid
bcb43bb3 708 sllx %o1,16,%o1
2e21922e 709 faddd $dota,$nloa,$nloa
bcb43bb3 710 sllx %o2,32,%o2
2e21922e 711 faddd $dotb,$nlob,$nlob
bcb43bb3
AP
712 sllx %o3,48,%o7
713 or %o1,%o0,%o0
2e21922e 714 faddd $ahic,$nhic,$dota ! $nhic
bcb43bb3 715 or %o2,%o0,%o0
2e21922e 716 faddd $ahid,$nhid,$dotb ! $nhid
bcb43bb3 717 or %o7,%o0,%o0 ! 64-bit result
2e21922e 718 faddd $nloc,$nhia,$nloc
bcb43bb3 719 addcc %g1,%o0,%o0
ebae8092 720 ldx [$tp+8],%o7 ! tp[j]
2e21922e 721 faddd $nlod,$nhib,$nlod
bcb43bb3 722 srlx %o3,16,%g1 ! 34-bit carry
2e21922e 723 fdtox $nloa,$nloa
bcb43bb3
AP
724 bcs,a %xcc,.+8
725 add %g1,1,%g1
2e21922e 726 fdtox $nlob,$nlob
bcb43bb3 727 addcc %o7,%o0,%o0
2e21922e 728 fdtox $nloc,$nloc
bcb43bb3
AP
729 bcs,a %xcc,.+8
730 add %g1,1,%g1
731
732 stx %o0,[$tp] ! tp[j-1]
2e21922e 733 fdtox $nlod,$nlod
1c3d2b94
AP
734
735 std $nloa,[%sp+$bias+$frame+0]
736 std $nlob,[%sp+$bias+$frame+8]
737 std $nloc,[%sp+$bias+$frame+16]
aa2be094 738 addcc $j,8,$j
2e21922e 739 std $nlod,[%sp+$bias+$frame+24]
aa2be094 740 bnz,pt %icc,.Linner
bcb43bb3 741 add $tp,8,$tp
1c3d2b94
AP
742\f
743.Linnerskip:
2e21922e
AP
744 fdtox $dota,$dota
745 fdtox $dotb,$dotb
746
1c3d2b94
AP
747 ldx [%sp+$bias+$frame+0],%o0
748 ldx [%sp+$bias+$frame+8],%o1
749 ldx [%sp+$bias+$frame+16],%o2
750 ldx [%sp+$bias+$frame+24],%o3
751
752 srlx %o0,16,%o7
2e21922e 753 std $dota,[%sp+$bias+$frame+32]
1c3d2b94 754 add %o7,%o1,%o1
2e21922e 755 std $dotb,[%sp+$bias+$frame+40]
1c3d2b94
AP
756 srlx %o1,16,%o7
757 add %o7,%o2,%o2
758 srlx %o2,16,%o7
759 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
760 and %o0,$mask,%o0
761 and %o1,$mask,%o1
762 and %o2,$mask,%o2
763 sllx %o1,16,%o1
764 sllx %o2,32,%o2
765 sllx %o3,48,%o7
766 or %o1,%o0,%o0
767 or %o2,%o0,%o0
2e21922e 768 ldx [%sp+$bias+$frame+32],%o4
1c3d2b94 769 or %o7,%o0,%o0 ! 64-bit result
2e21922e 770 ldx [%sp+$bias+$frame+40],%o5
1c3d2b94 771 addcc %g1,%o0,%o0
2e21922e 772 ldx [$tp+8],%o7 ! tp[j]
1c3d2b94
AP
773 srlx %o3,16,%g1 ! 34-bit carry
774 bcs,a %xcc,.+8
775 add %g1,1,%g1
776
1c3d2b94
AP
777 addcc %o7,%o0,%o0
778 bcs,a %xcc,.+8
779 add %g1,1,%g1
780
781 stx %o0,[$tp] ! tp[j-1]
782 add $tp,8,$tp
bcb43bb3 783
2e21922e
AP
784 srlx %o4,16,%o7
785 add %o7,%o5,%o5
786 and %o4,$mask,%o4
787 sllx %o5,16,%o7
788 or %o7,%o4,%o4
789 addcc %g1,%o4,%o4
790 srlx %o5,48,%g1
bcb43bb3
AP
791 bcs,a %xcc,.+8
792 add %g1,1,%g1
793
2e21922e
AP
794 addcc $carry,%o4,%o4
795 stx %o4,[$tp] ! tp[num-1]
bcb43bb3
AP
796 mov %g1,$carry
797 bcs,a %xcc,.+8
798 add $carry,1,$carry
799
aa2be094
AP
800 addcc $i,8,$i
801 bnz %icc,.Louter
bcb43bb3
AP
802 nop
803\f
! Final reduction, step 1: store tp - np into rp, 32 bits at a time.
! The borrow travels between the subccc instructions and across loop
! iterations in %icc.c; the plain add and the register branch brnz used
! for loop control are expected to leave the condition codes untouched.
7d9cf7c0 804 add $tp,8,$tp ! adjust tp to point at the end
7d9cf7c0 805 orn %g0,%g0,%g4 ! %g4 = all ones (overwritten by subc after the loop)
7d9cf7c0 806 sub %g0,$num,%o7 ! n=-num
23296942 807 ba .Lsub
673c55a2 808 subcc %g0,%g0,%g0 ! clear %icc.c
23296942
AP
809
810.align 32
bcb43bb3 811.Lsub:
87d3af64
AP
812 ldx [$tp+%o7],%o0 ! 64-bit word of tp
813 add $np,%o7,%g1
814 ld [%g1+0],%o2 ! matching pair of 32-bit words of np
815 ld [%g1+4],%o3
816 srlx %o0,32,%o1 ! upper half of the tp word
817 subccc %o0,%o2,%o2 ! first half: tp - np - borrow
818 add $rp,%o7,%g1
819 subccc %o1,%o3,%o3 ! second half, borrow chained through %icc.c
820 st %o2,[%g1+0]
aa2be094
AP
821 add %o7,8,%o7
822 brnz,pt %o7,.Lsub
87d3af64 823 st %o3,[%g1+4] ! delay slot: store second half of the difference
! %g4 = top carry minus final borrow; .Lcopy uses it as a select mask
! (all ones keeps tp, zero keeps the difference already stored in rp).
7d9cf7c0 824 subc $carry,0,%g4
6df8c74d 825 sub %g0,$num,%o7 ! n=-num
23296942
AP
826 ba .Lcopy
827 nop
bcb43bb3 828
! Final reduction, step 2: branch-free select between tp and the
! difference that .Lsub stored in rp, using %g4 as a bit mask, and wipe
! each tp word once it has been read.
23296942 829.align 32
bcb43bb3 830.Lcopy:
aa2be094 831 ldx [$tp+%o7],%o0 ! 64-bit word of tp
87d3af64 832 add $rp,%o7,%g1
7d9cf7c0
AP
833 ld [%g1+0],%o2 ! the two 32-bit words currently in rp
834 ld [%g1+4],%o3
835 stx %g0,[$tp+%o7] ! zero the tp word
836 and %o0,%g4,%o0 ! keep tp where the mask bits are set
837 srlx %o0,32,%o1 ! upper half of the masked tp word
838 andn %o2,%g4,%o2 ! keep rp where the mask bits are clear
839 andn %o3,%g4,%o3
840 or %o2,%o0,%o0 ! merge; only the low 32 bits get stored
841 or %o3,%o1,%o1
87d3af64 842 st %o0,[%g1+0]
aa2be094
AP
843 add %o7,8,%o7
844 brnz,pt %o7,.Lcopy
87d3af64 845 st %o1,[%g1+4] ! delay slot: store the second word
6df8c74d 846 sub %g0,$num,%o7 ! n=-num
bcb43bb3 847
! Zero the four stack vectors holding the double-precision copies of a
! and n, restore the ASI register saved in the prologue and return 1.
! The early-exit paths in the prologue jump straight to .Lret with 0
! already placed in %i0.
bcb43bb3 848.Lzap:
aa2be094
AP
849 stx %g0,[$ap_l+%o7]
850 stx %g0,[$ap_h+%o7]
851 stx %g0,[$np_l+%o7]
852 stx %g0,[$np_h+%o7]
853 add %o7,8,%o7 ! index counts up from -num to zero
854 brnz,pt %o7,.Lzap
bcb43bb3
AP
855 nop
856
857 ldx [%sp+$bias+$frame+48],%o7 ! value stored by the prologue
858 wr %g0,%o7,%asi ! restore %asi
859
860 mov 1,%i0 ! success
aa2be094 861.Lret:
bcb43bb3
AP
862 ret
863 restore
864.type $fname,#function
865.size $fname,(.-$fname)
87d3af64 866.asciz "Montgomery Multipltication for UltraSPARC, CRYPTOGAMS by <appro\@openssl.org>"
23296942 867.align 32
bcb43bb3
AP
868___
869
# Replace every `...` span in the generated text with the result of
# evaluating its contents as a Perl expression.
870$code =~ s/\`([^\`]*)\`/eval($1)/gem;
3b4a0225
AP
871
872# Below substitution makes it possible to compile without demanding
478b50cf 873# VIS extensions on command line, e.g. -xarch=v9 vs. -xarch=v9a. I
3b4a0225
AP
874# dare to do this, because VIS capability is detected at run-time now
875# and this routine is not called on CPU not capable to execute it. Do
876# note that fzeros is not the only VIS dependency! Another dependency
877# is implicit and is just _a_ numerical value loaded to %asi register,
878# which assembler can't recognize as VIS specific...
# Rewrite each "fzeros %fN" as a raw .word directive whose value is
# 0x81b00c20 with N shifted left by 25 OR-ed in, keeping the original
# mnemonic as a trailing "!" comment in the output.
879$code =~ s/fzeros\s+%f([0-9]+)/
880 sprintf(".word\t0x%x\t! fzeros %%f%d",0x81b00c20|($1<<25),$1)
881 /gem;
882
# Emit the generated assembly.  STDOUT is the output file here, and
# closing it flushes the buffered data; a failure at this point (for
# example a full disk) means the file is truncated, so it must be
# reported rather than ignored.
print $code;
close STDOUT or die "error closing STDOUT: $!";