]> git.ipfire.org Git - thirdparty/openssl.git/blame - crypto/bn/asm/sparcv9a-mont.pl
Clarify HAL SPARC64 support situation in sparcv9a-mont.pl.
[thirdparty/openssl.git] / crypto / bn / asm / sparcv9a-mont.pl
CommitLineData
bcb43bb3
AP
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
2e21922e
AP
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
bcb43bb3
AP
8# ====================================================================
9
aa2be094
AP
10# October 2005
11#
bcb43bb3
AP
12# "Teaser" Montgomery multiplication module for UltraSPARC. Why FPU?
13# Because unlike integer multiplier, which simply stalls whole CPU,
14# FPU is fully pipelined and can effectively emit 48 bit partial
15# product every cycle. Why not blended SPARC v9? One can argue that
16# making this module dependent on UltraSPARC VIS extension limits its
a4d729f3
AP
17# binary compatibility. Well yes, it does exclude SPARC64 prior-V(!)
18# implementations from compatibility matrix. But the rest, whole Sun
19# UltraSPARC family and brand new Fujitsu's SPARC64 V, all support
20# VIS extension instructions used in this module. This is considered
73b979e6
AP
21# good enough to not care about HAL SPARC64 users [if any] who have
22# integer-only pure SPARCv9 module to "fall down" to.
bcb43bb3
AP
23
24# USI&II cores currently exhibit uniform 2x improvement [over pre-
25# bn_mul_mont codebase] for all key lengths and benchmarks. On USIII
26# performance improves few percents for shorter keys and worsens few
aa2be094 27# percents for longer keys. This is because USIII integer multiplier
bcb43bb3
AP
28# is >3x faster than USI&II one, which is harder to match [but see
29# TODO list below]. It should also be noted that SPARC64 V features
30# out-of-order execution, which *might* mean that integer multiplier
a4d729f3
AP
31# is pipelined, which in turn *might* be impossible to match... As an
32# additional note, SPARC64 V implements FP Multiply-Add instruction,
33# which is perfectly usable in this context... In other words, as far
73b979e6 34# as Fujitsu SPARC64 V goes, talk to the author:-)
aa2be094 35
a00e414f
AP
36# The implementation implies following "non-natural" limitations on
37# input arguments:
aa2be094
AP
38# - num may not be less than 4;
39# - num has to be even;
40# - ap, bp, rp, np have to be 64-bit aligned [which is not a problem
41# as long as BIGNUM.d are malloc-ated];
42# Failure to meet any of these conditions has no fatal effects, simply
43# doesn't give any performance gain.
44
bcb43bb3 45# TODO:
bcb43bb3
AP
46# - modulo-schedule inner loop for better performance (on in-order
47# execution core such as UltraSPARC this shall result in further
48# noticeable(!) improvement);
49# - dedicated squaring procedure[?];
50
2e21922e
AP
51######################################################################
52# November 2006
53#
54# Modulo-scheduled inner loops allow interleaving floating point and
55# integer instructions and minimize Read-After-Write penalties. This
56# results in *further* 20-50% performance improvement [depending on
57# key length, more for longer keys] on USI&II cores and 30-80% - on
58# USIII&IV.
59
# Name of the routine emitted into the generated assembly.
a00e414f 60$fname="bn_mul_mont_fpu";
# Target ABI width: 32-bit unless a command-line argument contains
# -m64 or -xarch=v9, in which case the 64-bit ABI is selected.
bcb43bb3 61$bits=32;
3b4a0225 62for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
bcb43bb3
AP
63
# $bias is added to %sp every time the generated code addresses the
# stack; $frame is the number of bytes skipped above the biased %sp
# before the scratch locals begin (presumably the ABI save area --
# confirm against the SPARC ABI documents). Both depend on $bits.
64if ($bits==64) {
65 $bias=2047;
66 $frame=192;
67} else {
68 $bias=0;
69 $frame=128; # 96 rounded up to largest known cache-line
70}
# Size of the scratch area; the generated code addresses it as
# [%sp+$bias+$frame+0] through [%sp+$bias+$frame+48].
71$locals=64;
72
73# In order to provide for 32-/64-bit ABI duality, I keep integers wider
74# than 32 bit in %g1-%g4 and %o0-%o5. %l0-%l7 and %i0-%i5 are used
75# exclusively for pointers, indexes and other small values...
76# int bn_mul_mont(
77$rp="%i0"; # BN_ULONG *rp,
78$ap="%i1"; # const BN_ULONG *ap,
79$bp="%i2"; # const BN_ULONG *bp,
80$np="%i3"; # const BN_ULONG *np,
4d524040 81$n0="%i4"; # const BN_ULONG *n0,
bcb43bb3
AP
82$num="%i5"; # int num);
83
aa2be094 84$tp="%l0"; # t[num]
bcb43bb3
AP
85$ap_l="%l1"; # a[num],n[num] are smashed to 32-bit words and saved
86$ap_h="%l2"; # to these four vectors as double-precision FP values.
87$np_l="%l3"; # This way a bunch of fxtods are eliminated in second
88$np_h="%l4"; # loop and L1-cache aliasing is minimized...
89$i="%l5"; # outer index: negative byte offset added to $bp, stepped by 8 up to 0
90$j="%l6"; # inner index: negative byte offset added to $ap/$np and the smashed vectors, stepped by 8 up to 0
91$mask="%l7"; # 16-bit mask, 0xffff
92
aa2be094
AP
93$n0="%g4"; # reassigned(!) to "64-bit" register
94$carry="%i4"; # %i4 reused(!) for a carry bit
bcb43bb3
AP
95
96# FP register naming chart
97#
98# ..HILO
99# dcba
100# --------
101# LOa
102# LOb
103# LOc
104# LOd
105# HIa
106# HIb
107# HIc
108# HId
109# ..a
110# ..b
# $ba..$bd: the current b[i] split into four 16-bit pieces and converted
# to double precision.
111$ba="%f0"; $bb="%f2"; $bc="%f4"; $bd="%f6";
# $na..$nd: the per-iteration multiplier (ap[0]*b[i]+t[0])*n0, likewise
# split into four 16-bit pieces and converted to double precision.
112$na="%f8"; $nb="%f10"; $nc="%f12"; $nd="%f14";
# $alo/$ahi and $nlo/$nhi: the two 32-bit words of a[j] and n[j] as
# doubles. The names with a trailing underscore are the odd
# single-precision halves: the generated code loads the 32-bit word
# there and zeroes the even half before converting the pair.
113$alo="%f16"; $alo_="%f17"; $ahi="%f18"; $ahi_="%f19";
114$nlo="%f20"; $nlo_="%f21"; $nhi="%f22"; $nhi_="%f23";
115
# $dota/$dotb: the two topmost partial-product sums of one j step, kept
# in registers and added into the two lowest sums of the next j step.
116$dota="%f24"; $dotb="%f26";
117
# Sixteen product registers: {a,n} x {lo,hi} 32-bit word x 16-bit piece
# a..d of the corresponding multiplier.
118$aloa="%f32"; $alob="%f34"; $aloc="%f36"; $alod="%f38";
119$ahia="%f40"; $ahib="%f42"; $ahic="%f44"; $ahid="%f46";
120$nloa="%f48"; $nlob="%f50"; $nloc="%f52"; $nlod="%f54";
121$nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62";
122
123$ASI_FL16_P=0xD2; # magic ASI value to engage 16-bit FP load
124
# Template for the generated assembly. The routine named by $fname
# receives (rp, ap, bp, np, n0, num) in %i0-%i5 and leaves its status in
# %i0: 0 when the input is rejected (num below 4, odd num, or any of
# rp/ap/bp/np not 8-byte aligned), 1 after a completed multiplication.
# Rejection happens before %asi is modified.
# The prologue rescales num to a byte count of (num/2)*8, carves five
# vectors of that size out of the stack (tp followed by the four
# "smashed" a/n vectors), points $ap_l/$ap_h/$np_l/$np_h and
# rp/ap/bp/np at the END of their vectors so the loops can run negative
# offsets up to zero, and switches %asi to $ASI_FL16_P. The caller's
# %asi is parked at [%sp+$bias+$frame+48] and reloaded just before
# .Lret.
125$code=<<___;
126.ident "UltraSPARC Montgomery multiply by <appro\@fy.chalmers.se>"
127.section ".text",#alloc,#execinstr
128
129.global $fname
130.align 32
131$fname:
aa2be094 132 save %sp,-$frame-$locals,%sp
bcb43bb3 133 sethi %hi(0xffff),$mask
bcb43bb3 134 or $mask,%lo(0xffff),$mask
6df8c74d 135
aa2be094
AP
136 cmp $num,4
137 bl,a,pn %icc,.Lret
138 clr %i0
139 andcc $num,1,%g0 ! $num has to be even...
140 bnz,a,pn %icc,.Lret
141 clr %i0 ! signal "unsupported input value"
142 or $bp,$ap,%l0
143 srl $num,1,$num
144 or $rp,$np,%l1
145 or %l0,%l1,%l0
146 andcc %l0,7,%g0 ! ...and pointers has to be 8-byte aligned
147 bnz,a,pn %icc,.Lret
148 clr %i0 ! signal "unsupported input value"
149 ld [%i4+0],$n0 ! $n0 reassigned, remember?
150 ld [%i4+4],%o0
151 sllx %o0,32,%o0
152 or %o0,$n0,$n0 ! $n0=n0[1].n0[0]
6df8c74d 153
aa2be094 154 sll $num,3,$num ! num*=8
bcb43bb3
AP
155
156 add %sp,$bias,%o0 ! real top of stack
157 sll $num,2,%o1
158 add %o1,$num,%o1 ! %o1=num*5
159 sub %o0,%o1,%o0
bcb43bb3 160 and %o0,-2048,%o0 ! optimize TLB utilization
aa2be094 161 sub %o0,$bias,%sp ! alloca(5*num*8)
bcb43bb3 162
aa2be094 163 rd %asi,%o7 ! save %asi
bcb43bb3
AP
164 add %sp,$bias+$frame+$locals,$tp
165 add $tp,$num,$ap_l
aa2be094 166 add $ap_l,$num,$ap_l ! [an]p_[lh] point at the vectors' ends !
bcb43bb3
AP
167 add $ap_l,$num,$ap_h
168 add $ap_h,$num,$np_l
169 add $np_l,$num,$np_h
170
171 wr %g0,$ASI_FL16_P,%asi ! setup %asi for 16-bit FP loads
172
173 add $rp,$num,$rp ! readjust input pointers to point
174 add $ap,$num,$ap ! at the ends too...
175 add $bp,$num,$bp
176 add $np,$num,$np
177
aa2be094 178 stx %o7,[%sp+$bias+$frame+48] ! save %asi
bcb43bb3 179\f
6df8c74d
AP
180 sub %g0,$num,$i ! i=-num
181 sub %g0,$num,$j ! j=-num
bcb43bb3
AP
182
183 add $ap,$j,%o3
184 add $bp,$i,%o4
6df8c74d 185
bcb43bb3 186 ldx [$bp+$i],%o0 ! bp[0]
bcb43bb3 187 ldx [$ap+$j],%o1 ! ap[0]
6df8c74d
AP
188 sllx %o0,32,%g1
189 sllx %o1,32,%g5
190 srlx %o0,32,%o0
191 srlx %o1,32,%o1
192 or %g1,%o0,%o0
193 or %g5,%o1,%o1
194
aa2be094 195 add $np,$j,%o5
bcb43bb3
AP
196
197 mulx %o1,%o0,%o0 ! ap[0]*bp[0]
198 mulx $n0,%o0,%o0 ! ap[0]*bp[0]*n0
aa2be094 199 stx %o0,[%sp+$bias+$frame+0]
bcb43bb3 200
6df8c74d 201 ld [%o3+0],$alo_ ! load a[j] as pair of 32-bit words
aa2be094 202 fzeros $alo
6df8c74d 203 ld [%o3+4],$ahi_
aa2be094 204 fzeros $ahi
6df8c74d 205 ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
aa2be094 206 fzeros $nlo
6df8c74d 207 ld [%o5+4],$nhi_
aa2be094 208 fzeros $nhi
bcb43bb3
AP
209
210 ! transfer b[i] to FPU as 4x16-bit values
6df8c74d 211 ldda [%o4+2]%asi,$ba
bcb43bb3 212 fxtod $alo,$alo
6df8c74d 213 ldda [%o4+0]%asi,$bb
bcb43bb3 214 fxtod $ahi,$ahi
6df8c74d 215 ldda [%o4+6]%asi,$bc
bcb43bb3 216 fxtod $nlo,$nlo
6df8c74d 217 ldda [%o4+4]%asi,$bd
bcb43bb3
AP
218 fxtod $nhi,$nhi
219
220 ! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values
aa2be094 221 ldda [%sp+$bias+$frame+6]%asi,$na
bcb43bb3 222 fxtod $ba,$ba
aa2be094 223 ldda [%sp+$bias+$frame+4]%asi,$nb
bcb43bb3 224 fxtod $bb,$bb
aa2be094 225 ldda [%sp+$bias+$frame+2]%asi,$nc
bcb43bb3 226 fxtod $bc,$bc
aa2be094 227 ldda [%sp+$bias+$frame+0]%asi,$nd
bcb43bb3
AP
228 fxtod $bd,$bd
229
230 std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
231 fxtod $na,$na
232 std $ahi,[$ap_h+$j]
233 fxtod $nb,$nb
234 std $nlo,[$np_l+$j] ! save smashed np[j] in double format
235 fxtod $nc,$nc
236 std $nhi,[$np_h+$j]
237 fxtod $nd,$nd
238
aa2be094
AP
239 fmuld $alo,$ba,$aloa
240 fmuld $nlo,$na,$nloa
241 fmuld $alo,$bb,$alob
242 fmuld $nlo,$nb,$nlob
243 fmuld $alo,$bc,$aloc
aa2be094 244 faddd $aloa,$nloa,$nloa
6df8c74d 245 fmuld $nlo,$nc,$nloc
aa2be094 246 fmuld $alo,$bd,$alod
aa2be094 247 faddd $alob,$nlob,$nlob
6df8c74d 248 fmuld $nlo,$nd,$nlod
aa2be094 249 fmuld $ahi,$ba,$ahia
aa2be094 250 faddd $aloc,$nloc,$nloc
6df8c74d 251 fmuld $nhi,$na,$nhia
aa2be094 252 fmuld $ahi,$bb,$ahib
aa2be094 253 faddd $alod,$nlod,$nlod
6df8c74d 254 fmuld $nhi,$nb,$nhib
aa2be094 255 fmuld $ahi,$bc,$ahic
aa2be094 256 faddd $ahia,$nhia,$nhia
6df8c74d 257 fmuld $nhi,$nc,$nhic
aa2be094 258 fmuld $ahi,$bd,$ahid
6df8c74d 259 faddd $ahib,$nhib,$nhib
aa2be094 260 fmuld $nhi,$nd,$nhid
bcb43bb3 261
bcb43bb3
AP
262 faddd $ahic,$nhic,$dota ! $nhic
263 faddd $ahid,$nhid,$dotb ! $nhid
264
265 faddd $nloc,$nhia,$nloc
266 faddd $nlod,$nhib,$nlod
267
268 fdtox $nloa,$nloa
269 fdtox $nlob,$nlob
270 fdtox $nloc,$nloc
271 fdtox $nlod,$nlod
272
273 std $nloa,[%sp+$bias+$frame+0]
2e21922e 274 add $j,8,$j
bcb43bb3 275 std $nlob,[%sp+$bias+$frame+8]
2e21922e 276 add $ap,$j,%o4
bcb43bb3 277 std $nloc,[%sp+$bias+$frame+16]
2e21922e 278 add $np,$j,%o5
bcb43bb3 279 std $nlod,[%sp+$bias+$frame+24]
bcb43bb3 280\f
1c3d2b94 281 ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
aa2be094 282 fzeros $alo
1c3d2b94 283 ld [%o4+4],$ahi_
aa2be094 284 fzeros $ahi
1c3d2b94 285 ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
aa2be094 286 fzeros $nlo
1c3d2b94 287 ld [%o5+4],$nhi_
aa2be094 288 fzeros $nhi
bcb43bb3
AP
289
290 fxtod $alo,$alo
291 fxtod $ahi,$ahi
292 fxtod $nlo,$nlo
293 fxtod $nhi,$nhi
294
2e21922e 295 ldx [%sp+$bias+$frame+0],%o0
aa2be094 296 fmuld $alo,$ba,$aloa
2e21922e 297 ldx [%sp+$bias+$frame+8],%o1
aa2be094 298 fmuld $nlo,$na,$nloa
2e21922e 299 ldx [%sp+$bias+$frame+16],%o2
aa2be094 300 fmuld $alo,$bb,$alob
2e21922e 301 ldx [%sp+$bias+$frame+24],%o3
aa2be094 302 fmuld $nlo,$nb,$nlob
2e21922e
AP
303
304 srlx %o0,16,%o7
305 std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
aa2be094 306 fmuld $alo,$bc,$aloc
2e21922e
AP
307 add %o7,%o1,%o1
308 std $ahi,[$ap_h+$j]
309 faddd $aloa,$nloa,$nloa
6df8c74d 310 fmuld $nlo,$nc,$nloc
2e21922e
AP
311 srlx %o1,16,%o7
312 std $nlo,[$np_l+$j] ! save smashed np[j] in double format
aa2be094 313 fmuld $alo,$bd,$alod
2e21922e
AP
314 add %o7,%o2,%o2
315 std $nhi,[$np_h+$j]
316 faddd $alob,$nlob,$nlob
6df8c74d 317 fmuld $nlo,$nd,$nlod
2e21922e 318 srlx %o2,16,%o7
aa2be094 319 fmuld $ahi,$ba,$ahia
2e21922e
AP
320 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
321 faddd $aloc,$nloc,$nloc
6df8c74d 322 fmuld $nhi,$na,$nhia
2e21922e
AP
323 !and %o0,$mask,%o0
324 !and %o1,$mask,%o1
325 !and %o2,$mask,%o2
326 !sllx %o1,16,%o1
327 !sllx %o2,32,%o2
328 !sllx %o3,48,%o7
329 !or %o1,%o0,%o0
330 !or %o2,%o0,%o0
331 !or %o7,%o0,%o0 ! 64-bit result
332 srlx %o3,16,%g1 ! 34-bit carry
aa2be094 333 fmuld $ahi,$bb,$ahib
2e21922e 334
aa2be094 335 faddd $alod,$nlod,$nlod
6df8c74d 336 fmuld $nhi,$nb,$nhib
aa2be094 337 fmuld $ahi,$bc,$ahic
aa2be094 338 faddd $ahia,$nhia,$nhia
6df8c74d 339 fmuld $nhi,$nc,$nhic
aa2be094 340 fmuld $ahi,$bd,$ahid
aa2be094 341 faddd $ahib,$nhib,$nhib
6df8c74d 342 fmuld $nhi,$nd,$nhid
bcb43bb3
AP
343
344 faddd $dota,$nloa,$nloa
345 faddd $dotb,$nlob,$nlob
346 faddd $ahic,$nhic,$dota ! $nhic
347 faddd $ahid,$nhid,$dotb ! $nhid
348
349 faddd $nloc,$nhia,$nloc
350 faddd $nlod,$nhib,$nlod
351
352 fdtox $nloa,$nloa
353 fdtox $nlob,$nlob
354 fdtox $nloc,$nloc
355 fdtox $nlod,$nlod
356
357 std $nloa,[%sp+$bias+$frame+0]
358 std $nlob,[%sp+$bias+$frame+8]
2e21922e 359 addcc $j,8,$j
bcb43bb3 360 std $nloc,[%sp+$bias+$frame+16]
2e21922e 361 bz,pn %icc,.L1stskip
bcb43bb3 362 std $nlod,[%sp+$bias+$frame+24]
1c3d2b94 363\f
1c3d2b94
AP
364.align 32,0x1000000
365.L1st:
1c3d2b94
AP
366 add $ap,$j,%o4
367 add $np,$j,%o5
368 ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
369 fzeros $alo
370 ld [%o4+4],$ahi_
371 fzeros $ahi
372 ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
373 fzeros $nlo
374 ld [%o5+4],$nhi_
375 fzeros $nhi
376
377 fxtod $alo,$alo
378 fxtod $ahi,$ahi
379 fxtod $nlo,$nlo
380 fxtod $nhi,$nhi
381
2e21922e 382 ldx [%sp+$bias+$frame+0],%o0
1c3d2b94 383 fmuld $alo,$ba,$aloa
2e21922e 384 ldx [%sp+$bias+$frame+8],%o1
1c3d2b94 385 fmuld $nlo,$na,$nloa
2e21922e 386 ldx [%sp+$bias+$frame+16],%o2
1c3d2b94 387 fmuld $alo,$bb,$alob
2e21922e 388 ldx [%sp+$bias+$frame+24],%o3
1c3d2b94 389 fmuld $nlo,$nb,$nlob
2e21922e
AP
390
391 srlx %o0,16,%o7
392 std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
1c3d2b94 393 fmuld $alo,$bc,$aloc
2e21922e
AP
394 add %o7,%o1,%o1
395 std $ahi,[$ap_h+$j]
396 faddd $aloa,$nloa,$nloa
1c3d2b94 397 fmuld $nlo,$nc,$nloc
2e21922e
AP
398 srlx %o1,16,%o7
399 std $nlo,[$np_l+$j] ! save smashed np[j] in double format
1c3d2b94 400 fmuld $alo,$bd,$alod
2e21922e
AP
401 add %o7,%o2,%o2
402 std $nhi,[$np_h+$j]
403 faddd $alob,$nlob,$nlob
1c3d2b94 404 fmuld $nlo,$nd,$nlod
2e21922e 405 srlx %o2,16,%o7
1c3d2b94 406 fmuld $ahi,$ba,$ahia
2e21922e
AP
407 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
408 and %o0,$mask,%o0
409 faddd $aloc,$nloc,$nloc
1c3d2b94 410 fmuld $nhi,$na,$nhia
2e21922e
AP
411 and %o1,$mask,%o1
412 and %o2,$mask,%o2
1c3d2b94 413 fmuld $ahi,$bb,$ahib
2e21922e
AP
414 sllx %o1,16,%o1
415 faddd $alod,$nlod,$nlod
1c3d2b94 416 fmuld $nhi,$nb,$nhib
2e21922e 417 sllx %o2,32,%o2
1c3d2b94 418 fmuld $ahi,$bc,$ahic
2e21922e
AP
419 sllx %o3,48,%o7
420 or %o1,%o0,%o0
421 faddd $ahia,$nhia,$nhia
1c3d2b94 422 fmuld $nhi,$nc,$nhic
2e21922e 423 or %o2,%o0,%o0
1c3d2b94 424 fmuld $ahi,$bd,$ahid
2e21922e
AP
425 or %o7,%o0,%o0 ! 64-bit result
426 faddd $ahib,$nhib,$nhib
1c3d2b94 427 fmuld $nhi,$nd,$nhid
2e21922e
AP
428 addcc %g1,%o0,%o0
429 faddd $dota,$nloa,$nloa
430 srlx %o3,16,%g1 ! 34-bit carry
431 faddd $dotb,$nlob,$nlob
432 bcs,a %xcc,.+8
433 add %g1,1,%g1
434
435 stx %o0,[$tp] ! tp[j-1]=
1c3d2b94 436
1c3d2b94
AP
437 faddd $ahic,$nhic,$dota ! $nhic
438 faddd $ahid,$nhid,$dotb ! $nhid
439
440 faddd $nloc,$nhia,$nloc
441 faddd $nlod,$nhib,$nlod
442
443 fdtox $nloa,$nloa
444 fdtox $nlob,$nlob
445 fdtox $nloc,$nloc
446 fdtox $nlod,$nlod
447
448 std $nloa,[%sp+$bias+$frame+0]
449 std $nlob,[%sp+$bias+$frame+8]
450 std $nloc,[%sp+$bias+$frame+16]
451 std $nlod,[%sp+$bias+$frame+24]
452
aa2be094
AP
453 addcc $j,8,$j
454 bnz,pt %icc,.L1st
bcb43bb3 455 add $tp,8,$tp
1c3d2b94
AP
456\f
457.L1stskip:
ebae8092
AP
458 fdtox $dota,$dota
459 fdtox $dotb,$dotb
460
1c3d2b94
AP
461 ldx [%sp+$bias+$frame+0],%o0
462 ldx [%sp+$bias+$frame+8],%o1
463 ldx [%sp+$bias+$frame+16],%o2
464 ldx [%sp+$bias+$frame+24],%o3
465
466 srlx %o0,16,%o7
ebae8092 467 std $dota,[%sp+$bias+$frame+32]
1c3d2b94 468 add %o7,%o1,%o1
ebae8092 469 std $dotb,[%sp+$bias+$frame+40]
1c3d2b94
AP
470 srlx %o1,16,%o7
471 add %o7,%o2,%o2
472 srlx %o2,16,%o7
473 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
474 and %o0,$mask,%o0
475 and %o1,$mask,%o1
476 and %o2,$mask,%o2
477 sllx %o1,16,%o1
478 sllx %o2,32,%o2
479 sllx %o3,48,%o7
480 or %o1,%o0,%o0
481 or %o2,%o0,%o0
482 or %o7,%o0,%o0 ! 64-bit result
ebae8092 483 ldx [%sp+$bias+$frame+32],%o4
1c3d2b94 484 addcc %g1,%o0,%o0
ebae8092 485 ldx [%sp+$bias+$frame+40],%o5
1c3d2b94
AP
486 srlx %o3,16,%g1 ! 34-bit carry
487 bcs,a %xcc,.+8
488 add %g1,1,%g1
489
490 stx %o0,[$tp] ! tp[j-1]=
491 add $tp,8,$tp
bcb43bb3 492
ebae8092
AP
493 srlx %o4,16,%o7
494 add %o7,%o5,%o5
495 and %o4,$mask,%o4
496 sllx %o5,16,%o7
497 or %o7,%o4,%o4
498 addcc %g1,%o4,%o4
499 srlx %o5,48,%g1
bcb43bb3
AP
500 bcs,a %xcc,.+8
501 add %g1,1,%g1
502
503 mov %g1,$carry
ebae8092 504 stx %o4,[$tp] ! tp[num-1]=
bcb43bb3
AP
505\f
506 ba .Louter
507 add $i,8,$i
508.align 32
509.Louter:
6df8c74d 510 sub %g0,$num,$j ! j=-num
bcb43bb3
AP
511 add %sp,$bias+$frame+$locals,$tp
512
513 add $bp,$i,%o4
6df8c74d 514
bcb43bb3 515 ldx [$bp+$i],%o0 ! bp[i]
bcb43bb3 516 ldx [$ap+$j],%o1 ! ap[0]
6df8c74d
AP
517 sllx %o0,32,%g1
518 sllx %o1,32,%g5
519 srlx %o0,32,%o0
520 srlx %o1,32,%o1
521 or %g1,%o0,%o0
522 or %g5,%o1,%o1
523
bcb43bb3
AP
524 ldx [$tp],%o2 ! tp[0]
525 mulx %o1,%o0,%o0
526 addcc %o2,%o0,%o0
527 mulx $n0,%o0,%o0 ! (ap[0]*bp[i]+t[0])*n0
aa2be094 528 stx %o0,[%sp+$bias+$frame+0]
bcb43bb3 529
bcb43bb3 530 ! transfer b[i] to FPU as 4x16-bit values
6df8c74d
AP
531 ldda [%o4+2]%asi,$ba
532 ldda [%o4+0]%asi,$bb
533 ldda [%o4+6]%asi,$bc
534 ldda [%o4+4]%asi,$bd
bcb43bb3
AP
535
536 ! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
aa2be094 537 ldda [%sp+$bias+$frame+6]%asi,$na
bcb43bb3 538 fxtod $ba,$ba
aa2be094 539 ldda [%sp+$bias+$frame+4]%asi,$nb
bcb43bb3 540 fxtod $bb,$bb
aa2be094 541 ldda [%sp+$bias+$frame+2]%asi,$nc
bcb43bb3 542 fxtod $bc,$bc
aa2be094 543 ldda [%sp+$bias+$frame+0]%asi,$nd
bcb43bb3
AP
544 fxtod $bd,$bd
545 ldd [$ap_l+$j],$alo ! load a[j] in double format
546 fxtod $na,$na
547 ldd [$ap_h+$j],$ahi
548 fxtod $nb,$nb
549 ldd [$np_l+$j],$nlo ! load n[j] in double format
550 fxtod $nc,$nc
551 ldd [$np_h+$j],$nhi
552 fxtod $nd,$nd
553
aa2be094
AP
554 fmuld $alo,$ba,$aloa
555 fmuld $nlo,$na,$nloa
556 fmuld $alo,$bb,$alob
557 fmuld $nlo,$nb,$nlob
558 fmuld $alo,$bc,$aloc
aa2be094 559 faddd $aloa,$nloa,$nloa
6df8c74d 560 fmuld $nlo,$nc,$nloc
aa2be094 561 fmuld $alo,$bd,$alod
aa2be094 562 faddd $alob,$nlob,$nlob
6df8c74d 563 fmuld $nlo,$nd,$nlod
aa2be094 564 fmuld $ahi,$ba,$ahia
aa2be094 565 faddd $aloc,$nloc,$nloc
6df8c74d 566 fmuld $nhi,$na,$nhia
aa2be094 567 fmuld $ahi,$bb,$ahib
aa2be094 568 faddd $alod,$nlod,$nlod
6df8c74d 569 fmuld $nhi,$nb,$nhib
aa2be094 570 fmuld $ahi,$bc,$ahic
aa2be094 571 faddd $ahia,$nhia,$nhia
6df8c74d 572 fmuld $nhi,$nc,$nhic
aa2be094 573 fmuld $ahi,$bd,$ahid
6df8c74d 574 faddd $ahib,$nhib,$nhib
aa2be094 575 fmuld $nhi,$nd,$nhid
bcb43bb3 576
bcb43bb3
AP
577 faddd $ahic,$nhic,$dota ! $nhic
578 faddd $ahid,$nhid,$dotb ! $nhid
579
580 faddd $nloc,$nhia,$nloc
581 faddd $nlod,$nhib,$nlod
582
583 fdtox $nloa,$nloa
584 fdtox $nlob,$nlob
585 fdtox $nloc,$nloc
586 fdtox $nlod,$nlod
587
588 std $nloa,[%sp+$bias+$frame+0]
589 std $nlob,[%sp+$bias+$frame+8]
590 std $nloc,[%sp+$bias+$frame+16]
2e21922e 591 add $j,8,$j
bcb43bb3 592 std $nlod,[%sp+$bias+$frame+24]
2e21922e
AP
593\f
594 ldd [$ap_l+$j],$alo ! load a[j] in double format
595 ldd [$ap_h+$j],$ahi
596 ldd [$np_l+$j],$nlo ! load n[j] in double format
597 ldd [$np_h+$j],$nhi
598
599 fmuld $alo,$ba,$aloa
600 fmuld $nlo,$na,$nloa
601 fmuld $alo,$bb,$alob
602 fmuld $nlo,$nb,$nlob
603 fmuld $alo,$bc,$aloc
bcb43bb3 604 ldx [%sp+$bias+$frame+0],%o0
2e21922e
AP
605 faddd $aloa,$nloa,$nloa
606 fmuld $nlo,$nc,$nloc
bcb43bb3 607 ldx [%sp+$bias+$frame+8],%o1
2e21922e 608 fmuld $alo,$bd,$alod
bcb43bb3 609 ldx [%sp+$bias+$frame+16],%o2
2e21922e
AP
610 faddd $alob,$nlob,$nlob
611 fmuld $nlo,$nd,$nlod
bcb43bb3 612 ldx [%sp+$bias+$frame+24],%o3
2e21922e 613 fmuld $ahi,$ba,$ahia
bcb43bb3
AP
614
615 srlx %o0,16,%o7
2e21922e
AP
616 faddd $aloc,$nloc,$nloc
617 fmuld $nhi,$na,$nhia
bcb43bb3 618 add %o7,%o1,%o1
2e21922e 619 fmuld $ahi,$bb,$ahib
bcb43bb3 620 srlx %o1,16,%o7
2e21922e
AP
621 faddd $alod,$nlod,$nlod
622 fmuld $nhi,$nb,$nhib
bcb43bb3 623 add %o7,%o2,%o2
2e21922e 624 fmuld $ahi,$bc,$ahic
bcb43bb3 625 srlx %o2,16,%o7
2e21922e
AP
626 faddd $ahia,$nhia,$nhia
627 fmuld $nhi,$nc,$nhic
bcb43bb3
AP
628 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
629 ! why?
630 and %o0,$mask,%o0
2e21922e 631 fmuld $ahi,$bd,$ahid
bcb43bb3
AP
632 and %o1,$mask,%o1
633 and %o2,$mask,%o2
2e21922e
AP
634 faddd $ahib,$nhib,$nhib
635 fmuld $nhi,$nd,$nhid
bcb43bb3 636 sllx %o1,16,%o1
2e21922e 637 faddd $dota,$nloa,$nloa
bcb43bb3 638 sllx %o2,32,%o2
2e21922e 639 faddd $dotb,$nlob,$nlob
bcb43bb3
AP
640 sllx %o3,48,%o7
641 or %o1,%o0,%o0
2e21922e 642 faddd $ahic,$nhic,$dota ! $nhic
bcb43bb3 643 or %o2,%o0,%o0
2e21922e 644 faddd $ahid,$nhid,$dotb ! $nhid
bcb43bb3
AP
645 or %o7,%o0,%o0 ! 64-bit result
646 ldx [$tp],%o7
2e21922e 647 faddd $nloc,$nhia,$nloc
bcb43bb3
AP
648 addcc %o7,%o0,%o0
649 ! end-of-why?
2e21922e 650 faddd $nlod,$nhib,$nlod
bcb43bb3 651 srlx %o3,16,%g1 ! 34-bit carry
2e21922e 652 fdtox $nloa,$nloa
bcb43bb3
AP
653 bcs,a %xcc,.+8
654 add %g1,1,%g1
bcb43bb3 655
bcb43bb3
AP
656 fdtox $nlob,$nlob
657 fdtox $nloc,$nloc
658 fdtox $nlod,$nlod
659
660 std $nloa,[%sp+$bias+$frame+0]
661 std $nlob,[%sp+$bias+$frame+8]
2e21922e 662 addcc $j,8,$j
bcb43bb3 663 std $nloc,[%sp+$bias+$frame+16]
2e21922e 664 bz,pn %icc,.Linnerskip
bcb43bb3 665 std $nlod,[%sp+$bias+$frame+24]
1c3d2b94 666\f
ebae8092
AP
667 ba .Linner
668 nop
669.align 32
1c3d2b94 670.Linner:
2e21922e
AP
671 ldd [$ap_l+$j],$alo ! load a[j] in double format
672 ldd [$ap_h+$j],$ahi
673 ldd [$np_l+$j],$nlo ! load n[j] in double format
674 ldd [$np_h+$j],$nhi
675
676 fmuld $alo,$ba,$aloa
677 fmuld $nlo,$na,$nloa
678 fmuld $alo,$bb,$alob
679 fmuld $nlo,$nb,$nlob
680 fmuld $alo,$bc,$aloc
bcb43bb3 681 ldx [%sp+$bias+$frame+0],%o0
2e21922e
AP
682 faddd $aloa,$nloa,$nloa
683 fmuld $nlo,$nc,$nloc
bcb43bb3 684 ldx [%sp+$bias+$frame+8],%o1
2e21922e 685 fmuld $alo,$bd,$alod
bcb43bb3 686 ldx [%sp+$bias+$frame+16],%o2
2e21922e
AP
687 faddd $alob,$nlob,$nlob
688 fmuld $nlo,$nd,$nlod
bcb43bb3 689 ldx [%sp+$bias+$frame+24],%o3
2e21922e 690 fmuld $ahi,$ba,$ahia
bcb43bb3
AP
691
692 srlx %o0,16,%o7
2e21922e
AP
693 faddd $aloc,$nloc,$nloc
694 fmuld $nhi,$na,$nhia
bcb43bb3 695 add %o7,%o1,%o1
2e21922e 696 fmuld $ahi,$bb,$ahib
bcb43bb3 697 srlx %o1,16,%o7
2e21922e
AP
698 faddd $alod,$nlod,$nlod
699 fmuld $nhi,$nb,$nhib
bcb43bb3 700 add %o7,%o2,%o2
2e21922e 701 fmuld $ahi,$bc,$ahic
bcb43bb3 702 srlx %o2,16,%o7
2e21922e
AP
703 faddd $ahia,$nhia,$nhia
704 fmuld $nhi,$nc,$nhic
bcb43bb3
AP
705 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
706 and %o0,$mask,%o0
2e21922e 707 fmuld $ahi,$bd,$ahid
bcb43bb3
AP
708 and %o1,$mask,%o1
709 and %o2,$mask,%o2
2e21922e
AP
710 faddd $ahib,$nhib,$nhib
711 fmuld $nhi,$nd,$nhid
bcb43bb3 712 sllx %o1,16,%o1
2e21922e 713 faddd $dota,$nloa,$nloa
bcb43bb3 714 sllx %o2,32,%o2
2e21922e 715 faddd $dotb,$nlob,$nlob
bcb43bb3
AP
716 sllx %o3,48,%o7
717 or %o1,%o0,%o0
2e21922e 718 faddd $ahic,$nhic,$dota ! $nhic
bcb43bb3 719 or %o2,%o0,%o0
2e21922e 720 faddd $ahid,$nhid,$dotb ! $nhid
bcb43bb3 721 or %o7,%o0,%o0 ! 64-bit result
2e21922e 722 faddd $nloc,$nhia,$nloc
bcb43bb3 723 addcc %g1,%o0,%o0
ebae8092 724 ldx [$tp+8],%o7 ! tp[j]
2e21922e 725 faddd $nlod,$nhib,$nlod
bcb43bb3 726 srlx %o3,16,%g1 ! 34-bit carry
2e21922e 727 fdtox $nloa,$nloa
bcb43bb3
AP
728 bcs,a %xcc,.+8
729 add %g1,1,%g1
2e21922e 730 fdtox $nlob,$nlob
bcb43bb3 731 addcc %o7,%o0,%o0
2e21922e 732 fdtox $nloc,$nloc
bcb43bb3
AP
733 bcs,a %xcc,.+8
734 add %g1,1,%g1
735
736 stx %o0,[$tp] ! tp[j-1]
2e21922e 737 fdtox $nlod,$nlod
1c3d2b94
AP
738
739 std $nloa,[%sp+$bias+$frame+0]
740 std $nlob,[%sp+$bias+$frame+8]
741 std $nloc,[%sp+$bias+$frame+16]
aa2be094 742 addcc $j,8,$j
2e21922e 743 std $nlod,[%sp+$bias+$frame+24]
aa2be094 744 bnz,pt %icc,.Linner
bcb43bb3 745 add $tp,8,$tp
1c3d2b94
AP
746\f
747.Linnerskip:
2e21922e
AP
748 fdtox $dota,$dota
749 fdtox $dotb,$dotb
750
1c3d2b94
AP
751 ldx [%sp+$bias+$frame+0],%o0
752 ldx [%sp+$bias+$frame+8],%o1
753 ldx [%sp+$bias+$frame+16],%o2
754 ldx [%sp+$bias+$frame+24],%o3
755
756 srlx %o0,16,%o7
2e21922e 757 std $dota,[%sp+$bias+$frame+32]
1c3d2b94 758 add %o7,%o1,%o1
2e21922e 759 std $dotb,[%sp+$bias+$frame+40]
1c3d2b94
AP
760 srlx %o1,16,%o7
761 add %o7,%o2,%o2
762 srlx %o2,16,%o7
763 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
764 and %o0,$mask,%o0
765 and %o1,$mask,%o1
766 and %o2,$mask,%o2
767 sllx %o1,16,%o1
768 sllx %o2,32,%o2
769 sllx %o3,48,%o7
770 or %o1,%o0,%o0
771 or %o2,%o0,%o0
2e21922e 772 ldx [%sp+$bias+$frame+32],%o4
1c3d2b94 773 or %o7,%o0,%o0 ! 64-bit result
2e21922e 774 ldx [%sp+$bias+$frame+40],%o5
1c3d2b94 775 addcc %g1,%o0,%o0
2e21922e 776 ldx [$tp+8],%o7 ! tp[j]
1c3d2b94
AP
777 srlx %o3,16,%g1 ! 34-bit carry
778 bcs,a %xcc,.+8
779 add %g1,1,%g1
780
1c3d2b94
AP
781 addcc %o7,%o0,%o0
782 bcs,a %xcc,.+8
783 add %g1,1,%g1
784
785 stx %o0,[$tp] ! tp[j-1]
786 add $tp,8,$tp
bcb43bb3 787
2e21922e
AP
788 srlx %o4,16,%o7
789 add %o7,%o5,%o5
790 and %o4,$mask,%o4
791 sllx %o5,16,%o7
792 or %o7,%o4,%o4
793 addcc %g1,%o4,%o4
794 srlx %o5,48,%g1
bcb43bb3
AP
795 bcs,a %xcc,.+8
796 add %g1,1,%g1
797
2e21922e
AP
798 addcc $carry,%o4,%o4
799 stx %o4,[$tp] ! tp[num-1]
bcb43bb3
AP
800 mov %g1,$carry
801 bcs,a %xcc,.+8
802 add $carry,1,$carry
803
aa2be094
AP
804 addcc $i,8,$i
805 bnz %icc,.Louter
bcb43bb3
AP
806 nop
807\f
aa2be094 808 sub %g0,$num,%o7 ! n=-num
bcb43bb3
AP
809 cmp $carry,0 ! clears %icc.c
810 bne,pn %icc,.Lsub
aa2be094 811 add $tp,8,$tp ! adjust tp to point at the end
bcb43bb3
AP
812
813 ld [$tp-8],%o0
6df8c74d 814 ld [$np-4],%o1
aa2be094 815 cmp %o0,%o1 ! compare topmost words
bcb43bb3
AP
816 bcs,pt %icc,.Lcopy ! %icc.c is clean if not taken
817 nop
818
819.align 32,0x1000000
820.Lsub:
aa2be094
AP
821 ldd [$tp+%o7],%o0
822 ldd [$np+%o7],%o2
aa2be094
AP
823 subccc %o1,%o2,%o2
824 subccc %o0,%o3,%o3
aa2be094
AP
825 std %o2,[$rp+%o7]
826 add %o7,8,%o7
827 brnz,pt %o7,.Lsub
bcb43bb3
AP
828 nop
829 subccc $carry,0,$carry
aa2be094 830 bcc,pt %icc,.Lzap
6df8c74d 831 sub %g0,$num,%o7 ! n=-num
bcb43bb3
AP
832
833.align 16,0x1000000
834.Lcopy:
aa2be094 835 ldx [$tp+%o7],%o0
aa2be094
AP
836 srlx %o0,32,%o1
837 std %o0,[$rp+%o7]
aa2be094
AP
838 add %o7,8,%o7
839 brnz,pt %o7,.Lcopy
bcb43bb3
AP
840 nop
841 ba .Lzap
6df8c74d 842 sub %g0,$num,%o7 ! n=-num
bcb43bb3
AP
843
844.align 32
845.Lzap:
aa2be094
AP
846 stx %g0,[$tp+%o7]
847 stx %g0,[$ap_l+%o7]
848 stx %g0,[$ap_h+%o7]
849 stx %g0,[$np_l+%o7]
850 stx %g0,[$np_h+%o7]
851 add %o7,8,%o7
852 brnz,pt %o7,.Lzap
bcb43bb3
AP
853 nop
854
855 ldx [%sp+$bias+$frame+48],%o7
856 wr %g0,%o7,%asi ! restore %asi
857
858 mov 1,%i0
aa2be094 859.Lret:
bcb43bb3
AP
860 ret
861 restore
862.type $fname,#function
863.size $fname,(.-$fname)
864___
865
# Replace every backtick-quoted span in the generated text with the
# result of evaluating its contents as a Perl expression.
866$code =~ s/\`([^\`]*)\`/eval($1)/gem;
3b4a0225
AP
867
868# Below substitution makes it possible to compile without demanding
869# VIS extensions on command line, e.g. -xarch=v9 vs. -xarch=v9a. I
870# dare to do this, because VIS capability is detected at run-time now
871# and this routine is not called on CPU not capable to execute it. Do
872# note that fzeros is not the only VIS dependency! Another dependency
873# is implicit and is just _a_ numerical value loaded to %asi register,
874# which assembler can't recognize as VIS specific...
# Every "fzeros %fN" in the template is rewritten as a raw .word whose
# value is 0x81b00c20 with the register number N OR-ed in at bit 25;
# the original mnemonic is kept as a trailing assembler comment.
875$code =~ s/fzeros\s+%f([0-9]+)/
876 sprintf(".word\t0x%x\t! fzeros %%f%d",0x81b00c20|($1<<25),$1)
877 /gem;
878
# Write the finished assembly to standard output.
bcb43bb3 879print $code;
3b4a0225 880# flush
bcb43bb3 881close STDOUT;