]>
Commit | Line | Data |
---|---|---|
bcb43bb3 AP |
1 | #!/usr/bin/env perl |
2 | ||
3 | # ==================================================================== | |
4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | |
2e21922e AP |
5 | # project. The module is, however, dual licensed under OpenSSL and |
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | |
7 | # details see http://www.openssl.org/~appro/cryptogams/. | |
bcb43bb3 AP |
8 | # ==================================================================== |
9 | ||
aa2be094 AP |
10 | # October 2005 |
11 | # | |
bcb43bb3 AP |
12 | # "Teaser" Montgomery multiplication module for UltraSPARC. Why FPU? |
13 | # Because unlike integer multiplier, which simply stalls whole CPU, | |
14 | # FPU is fully pipelined and can effectively emit 48 bit partial | |
15 | # product every cycle. Why not blended SPARC v9? One can argue that | |
16 | # making this module dependent on UltraSPARC VIS extension limits its | |
a4d729f3 AP |
17 | # binary compatibility. Well yes, it does exclude SPARC64 prior-V(!) |
18 | # implementations from compatibility matrix. But the rest, whole Sun | |
19 | # UltraSPARC family and brand new Fujitsu's SPARC64 V, all support | |
20 | # VIS extension instructions used in this module. This is considered | |
73b979e6 AP |
21 | # good enough to not care about HAL SPARC64 users [if any] who have |
22 | # integer-only pure SPARCv9 module to "fall down" to. | |
bcb43bb3 AP |
23 | |
24 | # USI&II cores currently exhibit uniform 2x improvement [over pre- | |
25 | # bn_mul_mont codebase] for all key lengths and benchmarks. On USIII | |
26 | # performance improves a few percent for shorter keys and worsens a few | |
aa2be094 | 27 | # percent for longer keys. This is because USIII integer multiplier |
bcb43bb3 AP |
28 | # is >3x faster than USI&II one, which is harder to match [but see |
29 | # TODO list below]. It should also be noted that SPARC64 V features | |
30 | # out-of-order execution, which *might* mean that integer multiplier | |
a4d729f3 AP |
31 | # is pipelined, which in turn *might* be impossible to match... As an | |
32 | # additional note, SPARC64 V implements FP Multiply-Add instruction, | |
33 | # which is perfectly usable in this context... In other words, as far | |
73b979e6 | 34 | # as Fujitsu SPARC64 V goes, talk to the author:-) |
aa2be094 | 35 | |
a00e414f AP |
36 | # The implementation implies following "non-natural" limitations on |
37 | # input arguments: | |
aa2be094 AP |
38 | # - num may not be less than 4; |
39 | # - num has to be even; | |
40 | # - ap, bp, rp, np have to be 64-bit aligned [which is not a problem | |
41 | # as long as BIGNUM.d are malloc-ated]; | |
42 | # Failure to meet any of these conditions has no fatal effects, simply | |
43 | # doesn't give any performance gain. | |
44 | ||
bcb43bb3 | 45 | # TODO: |
bcb43bb3 AP |
46 | # - modulo-schedule inner loop for better performance (on in-order |
47 | # execution core such as UltraSPARC this shall result in further | |
48 | # noticeable(!) improvement); | |
49 | # - dedicated squaring procedure[?]; | |
50 | ||
2e21922e AP |
51 | ###################################################################### |
52 | # November 2006 | |
53 | # | |
54 | # Modulo-scheduled inner loops allow interleaving floating point and | |
55 | # integer instructions and minimize Read-After-Write penalties. This | |
56 | # results in *further* 20-50% performance improvement [depending on | |
57 | # key length, more for longer keys] on USI&II cores and 30-80% - on | |
58 | # USIII&IV. | |
59 | ||
a00e414f | 60 | $fname="bn_mul_mont_fpu"; |
bcb43bb3 | 61 | $bits=32; |
3b4a0225 | 62 | for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } |
bcb43bb3 AP |
63 | |
64 | if ($bits==64) { | |
65 | $bias=2047; | |
66 | $frame=192; | |
67 | } else { | |
68 | $bias=0; | |
69 | $frame=128; # 96 rounded up to largest known cache-line | |
70 | } | |
71 | $locals=64; | |
72 | ||
73 | # In order to provide for 32-/64-bit ABI duality, I keep integers wider | |
74 | # than 32 bit in %g1-%g4 and %o0-%o5. %l0-%l7 and %i0-%i5 are used | |
75 | # exclusively for pointers, indexes and other small values... | |
76 | # int bn_mul_mont( | |
77 | $rp="%i0"; # BN_ULONG *rp, | |
78 | $ap="%i1"; # const BN_ULONG *ap, | |
79 | $bp="%i2"; # const BN_ULONG *bp, | |
80 | $np="%i3"; # const BN_ULONG *np, | |
4d524040 | 81 | $n0="%i4"; # const BN_ULONG *n0, |
bcb43bb3 AP |
82 | $num="%i5"; # int num); |
83 | ||
aa2be094 | 84 | $tp="%l0"; # t[num] |
bcb43bb3 AP |
85 | $ap_l="%l1"; # a[num],n[num] are smashed to 32-bit words and saved |
86 | $ap_h="%l2"; # to these four vectors as double-precision FP values. | |
87 | $np_l="%l3"; # This way a bunch of fxtods are eliminated in second | |
88 | $np_h="%l4"; # loop and L1-cache aliasing is minimized... | |
89 | $i="%l5"; | |
90 | $j="%l6"; | |
91 | $mask="%l7"; # 16-bit mask, 0xffff | |
92 | ||
aa2be094 AP |
93 | $n0="%g4"; # reassigned(!) to "64-bit" register |
94 | $carry="%i4"; # %i4 reused(!) for a carry bit | |
bcb43bb3 AP |
95 | |
96 | # FP register naming chart | |
97 | # | |
98 | # ..HILO | |
99 | # dcba | |
100 | # -------- | |
101 | # LOa | |
102 | # LOb | |
103 | # LOc | |
104 | # LOd | |
105 | # HIa | |
106 | # HIb | |
107 | # HIc | |
108 | # HId | |
109 | # ..a | |
110 | # ..b | |
111 | $ba="%f0"; $bb="%f2"; $bc="%f4"; $bd="%f6"; | |
112 | $na="%f8"; $nb="%f10"; $nc="%f12"; $nd="%f14"; | |
113 | $alo="%f16"; $alo_="%f17"; $ahi="%f18"; $ahi_="%f19"; | |
114 | $nlo="%f20"; $nlo_="%f21"; $nhi="%f22"; $nhi_="%f23"; | |
115 | ||
116 | $dota="%f24"; $dotb="%f26"; | |
117 | ||
118 | $aloa="%f32"; $alob="%f34"; $aloc="%f36"; $alod="%f38"; | |
119 | $ahia="%f40"; $ahib="%f42"; $ahic="%f44"; $ahid="%f46"; | |
120 | $nloa="%f48"; $nlob="%f50"; $nloc="%f52"; $nlod="%f54"; | |
121 | $nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62"; | |
122 | ||
123 | $ASI_FL16_P=0xD2; # magic ASI value to engage 16-bit FP load | |
124 | ||
125 | $code=<<___; | |
126 | .ident "UltraSPARC Montgomery multiply by <appro\@fy.chalmers.se>" | |
127 | .section ".text",#alloc,#execinstr | |
128 | ||
129 | .global $fname | |
130 | .align 32 | |
131 | $fname: | |
aa2be094 | 132 | save %sp,-$frame-$locals,%sp |
bcb43bb3 | 133 | sethi %hi(0xffff),$mask |
bcb43bb3 | 134 | or $mask,%lo(0xffff),$mask |
6df8c74d | 135 | |
aa2be094 AP |
136 | cmp $num,4 |
137 | bl,a,pn %icc,.Lret | |
138 | clr %i0 | |
139 | andcc $num,1,%g0 ! $num has to be even... | |
140 | bnz,a,pn %icc,.Lret | |
141 | clr %i0 ! signal "unsupported input value" | |
142 | or $bp,$ap,%l0 | |
143 | srl $num,1,$num | |
144 | or $rp,$np,%l1 | |
145 | or %l0,%l1,%l0 | |
146 | andcc %l0,7,%g0 ! ...and pointers has to be 8-byte aligned | |
147 | bnz,a,pn %icc,.Lret | |
148 | clr %i0 ! signal "unsupported input value" | |
149 | ld [%i4+0],$n0 ! $n0 reassigned, remember? | |
150 | ld [%i4+4],%o0 | |
151 | sllx %o0,32,%o0 | |
152 | or %o0,$n0,$n0 ! $n0=n0[1].n0[0] | |
6df8c74d | 153 | |
aa2be094 | 154 | sll $num,3,$num ! num*=8 |
bcb43bb3 AP |
155 | |
156 | add %sp,$bias,%o0 ! real top of stack | |
157 | sll $num,2,%o1 | |
158 | add %o1,$num,%o1 ! %o1=num*5 | |
159 | sub %o0,%o1,%o0 | |
bcb43bb3 | 160 | and %o0,-2048,%o0 ! optimize TLB utilization |
aa2be094 | 161 | sub %o0,$bias,%sp ! alloca(5*num*8) |
bcb43bb3 | 162 | |
aa2be094 | 163 | rd %asi,%o7 ! save %asi |
bcb43bb3 AP |
164 | add %sp,$bias+$frame+$locals,$tp |
165 | add $tp,$num,$ap_l | |
aa2be094 | 166 | add $ap_l,$num,$ap_l ! [an]p_[lh] point at the vectors' ends ! |
bcb43bb3 AP |
167 | add $ap_l,$num,$ap_h |
168 | add $ap_h,$num,$np_l | |
169 | add $np_l,$num,$np_h | |
170 | ||
171 | wr %g0,$ASI_FL16_P,%asi ! setup %asi for 16-bit FP loads | |
172 | ||
173 | add $rp,$num,$rp ! readjust input pointers to point | |
174 | add $ap,$num,$ap ! at the ends too... | |
175 | add $bp,$num,$bp | |
176 | add $np,$num,$np | |
177 | ||
aa2be094 | 178 | stx %o7,[%sp+$bias+$frame+48] ! save %asi |
bcb43bb3 | 179 | \f |
6df8c74d AP |
180 | sub %g0,$num,$i ! i=-num |
181 | sub %g0,$num,$j ! j=-num | |
bcb43bb3 AP |
182 | |
183 | add $ap,$j,%o3 | |
184 | add $bp,$i,%o4 | |
6df8c74d | 185 | |
bcb43bb3 | 186 | ldx [$bp+$i],%o0 ! bp[0] |
bcb43bb3 | 187 | ldx [$ap+$j],%o1 ! ap[0] |
6df8c74d AP |
188 | sllx %o0,32,%g1 |
189 | sllx %o1,32,%g5 | |
190 | srlx %o0,32,%o0 | |
191 | srlx %o1,32,%o1 | |
192 | or %g1,%o0,%o0 | |
193 | or %g5,%o1,%o1 | |
194 | ||
aa2be094 | 195 | add $np,$j,%o5 |
bcb43bb3 AP |
196 | |
197 | mulx %o1,%o0,%o0 ! ap[0]*bp[0] | |
198 | mulx $n0,%o0,%o0 ! ap[0]*bp[0]*n0 | |
aa2be094 | 199 | stx %o0,[%sp+$bias+$frame+0] |
bcb43bb3 | 200 | |
6df8c74d | 201 | ld [%o3+0],$alo_ ! load a[j] as pair of 32-bit words |
aa2be094 | 202 | fzeros $alo |
6df8c74d | 203 | ld [%o3+4],$ahi_ |
aa2be094 | 204 | fzeros $ahi |
6df8c74d | 205 | ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words |
aa2be094 | 206 | fzeros $nlo |
6df8c74d | 207 | ld [%o5+4],$nhi_ |
aa2be094 | 208 | fzeros $nhi |
bcb43bb3 AP |
209 | |
210 | ! transfer b[i] to FPU as 4x16-bit values | |
6df8c74d | 211 | ldda [%o4+2]%asi,$ba |
bcb43bb3 | 212 | fxtod $alo,$alo |
6df8c74d | 213 | ldda [%o4+0]%asi,$bb |
bcb43bb3 | 214 | fxtod $ahi,$ahi |
6df8c74d | 215 | ldda [%o4+6]%asi,$bc |
bcb43bb3 | 216 | fxtod $nlo,$nlo |
6df8c74d | 217 | ldda [%o4+4]%asi,$bd |
bcb43bb3 AP |
218 | fxtod $nhi,$nhi |
219 | ||
220 | ! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values | |
aa2be094 | 221 | ldda [%sp+$bias+$frame+6]%asi,$na |
bcb43bb3 | 222 | fxtod $ba,$ba |
aa2be094 | 223 | ldda [%sp+$bias+$frame+4]%asi,$nb |
bcb43bb3 | 224 | fxtod $bb,$bb |
aa2be094 | 225 | ldda [%sp+$bias+$frame+2]%asi,$nc |
bcb43bb3 | 226 | fxtod $bc,$bc |
aa2be094 | 227 | ldda [%sp+$bias+$frame+0]%asi,$nd |
bcb43bb3 AP |
228 | fxtod $bd,$bd |
229 | ||
230 | std $alo,[$ap_l+$j] ! save smashed ap[j] in double format | |
231 | fxtod $na,$na | |
232 | std $ahi,[$ap_h+$j] | |
233 | fxtod $nb,$nb | |
234 | std $nlo,[$np_l+$j] ! save smashed np[j] in double format | |
235 | fxtod $nc,$nc | |
236 | std $nhi,[$np_h+$j] | |
237 | fxtod $nd,$nd | |
238 | ||
aa2be094 AP |
239 | fmuld $alo,$ba,$aloa |
240 | fmuld $nlo,$na,$nloa | |
241 | fmuld $alo,$bb,$alob | |
242 | fmuld $nlo,$nb,$nlob | |
243 | fmuld $alo,$bc,$aloc | |
aa2be094 | 244 | faddd $aloa,$nloa,$nloa |
6df8c74d | 245 | fmuld $nlo,$nc,$nloc |
aa2be094 | 246 | fmuld $alo,$bd,$alod |
aa2be094 | 247 | faddd $alob,$nlob,$nlob |
6df8c74d | 248 | fmuld $nlo,$nd,$nlod |
aa2be094 | 249 | fmuld $ahi,$ba,$ahia |
aa2be094 | 250 | faddd $aloc,$nloc,$nloc |
6df8c74d | 251 | fmuld $nhi,$na,$nhia |
aa2be094 | 252 | fmuld $ahi,$bb,$ahib |
aa2be094 | 253 | faddd $alod,$nlod,$nlod |
6df8c74d | 254 | fmuld $nhi,$nb,$nhib |
aa2be094 | 255 | fmuld $ahi,$bc,$ahic |
aa2be094 | 256 | faddd $ahia,$nhia,$nhia |
6df8c74d | 257 | fmuld $nhi,$nc,$nhic |
aa2be094 | 258 | fmuld $ahi,$bd,$ahid |
6df8c74d | 259 | faddd $ahib,$nhib,$nhib |
aa2be094 | 260 | fmuld $nhi,$nd,$nhid |
bcb43bb3 | 261 | |
bcb43bb3 AP |
262 | faddd $ahic,$nhic,$dota ! $nhic |
263 | faddd $ahid,$nhid,$dotb ! $nhid | |
264 | ||
265 | faddd $nloc,$nhia,$nloc | |
266 | faddd $nlod,$nhib,$nlod | |
267 | ||
268 | fdtox $nloa,$nloa | |
269 | fdtox $nlob,$nlob | |
270 | fdtox $nloc,$nloc | |
271 | fdtox $nlod,$nlod | |
272 | ||
273 | std $nloa,[%sp+$bias+$frame+0] | |
2e21922e | 274 | add $j,8,$j |
bcb43bb3 | 275 | std $nlob,[%sp+$bias+$frame+8] |
2e21922e | 276 | add $ap,$j,%o4 |
bcb43bb3 | 277 | std $nloc,[%sp+$bias+$frame+16] |
2e21922e | 278 | add $np,$j,%o5 |
bcb43bb3 | 279 | std $nlod,[%sp+$bias+$frame+24] |
bcb43bb3 | 280 | \f |
1c3d2b94 | 281 | ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words |
aa2be094 | 282 | fzeros $alo |
1c3d2b94 | 283 | ld [%o4+4],$ahi_ |
aa2be094 | 284 | fzeros $ahi |
1c3d2b94 | 285 | ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words |
aa2be094 | 286 | fzeros $nlo |
1c3d2b94 | 287 | ld [%o5+4],$nhi_ |
aa2be094 | 288 | fzeros $nhi |
bcb43bb3 AP |
289 | |
290 | fxtod $alo,$alo | |
291 | fxtod $ahi,$ahi | |
292 | fxtod $nlo,$nlo | |
293 | fxtod $nhi,$nhi | |
294 | ||
2e21922e | 295 | ldx [%sp+$bias+$frame+0],%o0 |
aa2be094 | 296 | fmuld $alo,$ba,$aloa |
2e21922e | 297 | ldx [%sp+$bias+$frame+8],%o1 |
aa2be094 | 298 | fmuld $nlo,$na,$nloa |
2e21922e | 299 | ldx [%sp+$bias+$frame+16],%o2 |
aa2be094 | 300 | fmuld $alo,$bb,$alob |
2e21922e | 301 | ldx [%sp+$bias+$frame+24],%o3 |
aa2be094 | 302 | fmuld $nlo,$nb,$nlob |
2e21922e AP |
303 | |
304 | srlx %o0,16,%o7 | |
305 | std $alo,[$ap_l+$j] ! save smashed ap[j] in double format | |
aa2be094 | 306 | fmuld $alo,$bc,$aloc |
2e21922e AP |
307 | add %o7,%o1,%o1 |
308 | std $ahi,[$ap_h+$j] | |
309 | faddd $aloa,$nloa,$nloa | |
6df8c74d | 310 | fmuld $nlo,$nc,$nloc |
2e21922e AP |
311 | srlx %o1,16,%o7 |
312 | std $nlo,[$np_l+$j] ! save smashed np[j] in double format | |
aa2be094 | 313 | fmuld $alo,$bd,$alod |
2e21922e AP |
314 | add %o7,%o2,%o2 |
315 | std $nhi,[$np_h+$j] | |
316 | faddd $alob,$nlob,$nlob | |
6df8c74d | 317 | fmuld $nlo,$nd,$nlod |
2e21922e | 318 | srlx %o2,16,%o7 |
aa2be094 | 319 | fmuld $ahi,$ba,$ahia |
2e21922e AP |
320 | add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] |
321 | faddd $aloc,$nloc,$nloc | |
6df8c74d | 322 | fmuld $nhi,$na,$nhia |
2e21922e AP |
323 | !and %o0,$mask,%o0 |
324 | !and %o1,$mask,%o1 | |
325 | !and %o2,$mask,%o2 | |
326 | !sllx %o1,16,%o1 | |
327 | !sllx %o2,32,%o2 | |
328 | !sllx %o3,48,%o7 | |
329 | !or %o1,%o0,%o0 | |
330 | !or %o2,%o0,%o0 | |
331 | !or %o7,%o0,%o0 ! 64-bit result | |
332 | srlx %o3,16,%g1 ! 34-bit carry | |
aa2be094 | 333 | fmuld $ahi,$bb,$ahib |
2e21922e | 334 | |
aa2be094 | 335 | faddd $alod,$nlod,$nlod |
6df8c74d | 336 | fmuld $nhi,$nb,$nhib |
aa2be094 | 337 | fmuld $ahi,$bc,$ahic |
aa2be094 | 338 | faddd $ahia,$nhia,$nhia |
6df8c74d | 339 | fmuld $nhi,$nc,$nhic |
aa2be094 | 340 | fmuld $ahi,$bd,$ahid |
aa2be094 | 341 | faddd $ahib,$nhib,$nhib |
6df8c74d | 342 | fmuld $nhi,$nd,$nhid |
bcb43bb3 AP |
343 | |
344 | faddd $dota,$nloa,$nloa | |
345 | faddd $dotb,$nlob,$nlob | |
346 | faddd $ahic,$nhic,$dota ! $nhic | |
347 | faddd $ahid,$nhid,$dotb ! $nhid | |
348 | ||
349 | faddd $nloc,$nhia,$nloc | |
350 | faddd $nlod,$nhib,$nlod | |
351 | ||
352 | fdtox $nloa,$nloa | |
353 | fdtox $nlob,$nlob | |
354 | fdtox $nloc,$nloc | |
355 | fdtox $nlod,$nlod | |
356 | ||
357 | std $nloa,[%sp+$bias+$frame+0] | |
358 | std $nlob,[%sp+$bias+$frame+8] | |
2e21922e | 359 | addcc $j,8,$j |
bcb43bb3 | 360 | std $nloc,[%sp+$bias+$frame+16] |
2e21922e | 361 | bz,pn %icc,.L1stskip |
bcb43bb3 | 362 | std $nlod,[%sp+$bias+$frame+24] |
1c3d2b94 | 363 | \f |
1c3d2b94 AP |
364 | .align 32,0x1000000 |
365 | .L1st: | |
1c3d2b94 AP |
366 | add $ap,$j,%o4 |
367 | add $np,$j,%o5 | |
368 | ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words | |
369 | fzeros $alo | |
370 | ld [%o4+4],$ahi_ | |
371 | fzeros $ahi | |
372 | ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words | |
373 | fzeros $nlo | |
374 | ld [%o5+4],$nhi_ | |
375 | fzeros $nhi | |
376 | ||
377 | fxtod $alo,$alo | |
378 | fxtod $ahi,$ahi | |
379 | fxtod $nlo,$nlo | |
380 | fxtod $nhi,$nhi | |
381 | ||
2e21922e | 382 | ldx [%sp+$bias+$frame+0],%o0 |
1c3d2b94 | 383 | fmuld $alo,$ba,$aloa |
2e21922e | 384 | ldx [%sp+$bias+$frame+8],%o1 |
1c3d2b94 | 385 | fmuld $nlo,$na,$nloa |
2e21922e | 386 | ldx [%sp+$bias+$frame+16],%o2 |
1c3d2b94 | 387 | fmuld $alo,$bb,$alob |
2e21922e | 388 | ldx [%sp+$bias+$frame+24],%o3 |
1c3d2b94 | 389 | fmuld $nlo,$nb,$nlob |
2e21922e AP |
390 | |
391 | srlx %o0,16,%o7 | |
392 | std $alo,[$ap_l+$j] ! save smashed ap[j] in double format | |
1c3d2b94 | 393 | fmuld $alo,$bc,$aloc |
2e21922e AP |
394 | add %o7,%o1,%o1 |
395 | std $ahi,[$ap_h+$j] | |
396 | faddd $aloa,$nloa,$nloa | |
1c3d2b94 | 397 | fmuld $nlo,$nc,$nloc |
2e21922e AP |
398 | srlx %o1,16,%o7 |
399 | std $nlo,[$np_l+$j] ! save smashed np[j] in double format | |
1c3d2b94 | 400 | fmuld $alo,$bd,$alod |
2e21922e AP |
401 | add %o7,%o2,%o2 |
402 | std $nhi,[$np_h+$j] | |
403 | faddd $alob,$nlob,$nlob | |
1c3d2b94 | 404 | fmuld $nlo,$nd,$nlod |
2e21922e | 405 | srlx %o2,16,%o7 |
1c3d2b94 | 406 | fmuld $ahi,$ba,$ahia |
2e21922e AP |
407 | add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] |
408 | and %o0,$mask,%o0 | |
409 | faddd $aloc,$nloc,$nloc | |
1c3d2b94 | 410 | fmuld $nhi,$na,$nhia |
2e21922e AP |
411 | and %o1,$mask,%o1 |
412 | and %o2,$mask,%o2 | |
1c3d2b94 | 413 | fmuld $ahi,$bb,$ahib |
2e21922e AP |
414 | sllx %o1,16,%o1 |
415 | faddd $alod,$nlod,$nlod | |
1c3d2b94 | 416 | fmuld $nhi,$nb,$nhib |
2e21922e | 417 | sllx %o2,32,%o2 |
1c3d2b94 | 418 | fmuld $ahi,$bc,$ahic |
2e21922e AP |
419 | sllx %o3,48,%o7 |
420 | or %o1,%o0,%o0 | |
421 | faddd $ahia,$nhia,$nhia | |
1c3d2b94 | 422 | fmuld $nhi,$nc,$nhic |
2e21922e | 423 | or %o2,%o0,%o0 |
1c3d2b94 | 424 | fmuld $ahi,$bd,$ahid |
2e21922e AP |
425 | or %o7,%o0,%o0 ! 64-bit result |
426 | faddd $ahib,$nhib,$nhib | |
1c3d2b94 | 427 | fmuld $nhi,$nd,$nhid |
2e21922e AP |
428 | addcc %g1,%o0,%o0 |
429 | faddd $dota,$nloa,$nloa | |
430 | srlx %o3,16,%g1 ! 34-bit carry | |
431 | faddd $dotb,$nlob,$nlob | |
432 | bcs,a %xcc,.+8 | |
433 | add %g1,1,%g1 | |
434 | ||
435 | stx %o0,[$tp] ! tp[j-1]= | |
1c3d2b94 | 436 | |
1c3d2b94 AP |
437 | faddd $ahic,$nhic,$dota ! $nhic |
438 | faddd $ahid,$nhid,$dotb ! $nhid | |
439 | ||
440 | faddd $nloc,$nhia,$nloc | |
441 | faddd $nlod,$nhib,$nlod | |
442 | ||
443 | fdtox $nloa,$nloa | |
444 | fdtox $nlob,$nlob | |
445 | fdtox $nloc,$nloc | |
446 | fdtox $nlod,$nlod | |
447 | ||
448 | std $nloa,[%sp+$bias+$frame+0] | |
449 | std $nlob,[%sp+$bias+$frame+8] | |
450 | std $nloc,[%sp+$bias+$frame+16] | |
451 | std $nlod,[%sp+$bias+$frame+24] | |
452 | ||
aa2be094 AP |
453 | addcc $j,8,$j |
454 | bnz,pt %icc,.L1st | |
bcb43bb3 | 455 | add $tp,8,$tp |
1c3d2b94 AP |
456 | \f |
457 | .L1stskip: | |
ebae8092 AP |
458 | fdtox $dota,$dota |
459 | fdtox $dotb,$dotb | |
460 | ||
1c3d2b94 AP |
461 | ldx [%sp+$bias+$frame+0],%o0 |
462 | ldx [%sp+$bias+$frame+8],%o1 | |
463 | ldx [%sp+$bias+$frame+16],%o2 | |
464 | ldx [%sp+$bias+$frame+24],%o3 | |
465 | ||
466 | srlx %o0,16,%o7 | |
ebae8092 | 467 | std $dota,[%sp+$bias+$frame+32] |
1c3d2b94 | 468 | add %o7,%o1,%o1 |
ebae8092 | 469 | std $dotb,[%sp+$bias+$frame+40] |
1c3d2b94 AP |
470 | srlx %o1,16,%o7 |
471 | add %o7,%o2,%o2 | |
472 | srlx %o2,16,%o7 | |
473 | add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] | |
474 | and %o0,$mask,%o0 | |
475 | and %o1,$mask,%o1 | |
476 | and %o2,$mask,%o2 | |
477 | sllx %o1,16,%o1 | |
478 | sllx %o2,32,%o2 | |
479 | sllx %o3,48,%o7 | |
480 | or %o1,%o0,%o0 | |
481 | or %o2,%o0,%o0 | |
482 | or %o7,%o0,%o0 ! 64-bit result | |
ebae8092 | 483 | ldx [%sp+$bias+$frame+32],%o4 |
1c3d2b94 | 484 | addcc %g1,%o0,%o0 |
ebae8092 | 485 | ldx [%sp+$bias+$frame+40],%o5 |
1c3d2b94 AP |
486 | srlx %o3,16,%g1 ! 34-bit carry |
487 | bcs,a %xcc,.+8 | |
488 | add %g1,1,%g1 | |
489 | ||
490 | stx %o0,[$tp] ! tp[j-1]= | |
491 | add $tp,8,$tp | |
bcb43bb3 | 492 | |
ebae8092 AP |
493 | srlx %o4,16,%o7 |
494 | add %o7,%o5,%o5 | |
495 | and %o4,$mask,%o4 | |
496 | sllx %o5,16,%o7 | |
497 | or %o7,%o4,%o4 | |
498 | addcc %g1,%o4,%o4 | |
499 | srlx %o5,48,%g1 | |
bcb43bb3 AP |
500 | bcs,a %xcc,.+8 |
501 | add %g1,1,%g1 | |
502 | ||
503 | mov %g1,$carry | |
ebae8092 | 504 | stx %o4,[$tp] ! tp[num-1]= |
bcb43bb3 AP |
505 | \f |
506 | ba .Louter | |
507 | add $i,8,$i | |
508 | .align 32 | |
509 | .Louter: | |
6df8c74d | 510 | sub %g0,$num,$j ! j=-num |
bcb43bb3 AP |
511 | add %sp,$bias+$frame+$locals,$tp |
512 | ||
513 | add $bp,$i,%o4 | |
6df8c74d | 514 | |
bcb43bb3 | 515 | ldx [$bp+$i],%o0 ! bp[i] |
bcb43bb3 | 516 | ldx [$ap+$j],%o1 ! ap[0] |
6df8c74d AP |
517 | sllx %o0,32,%g1 |
518 | sllx %o1,32,%g5 | |
519 | srlx %o0,32,%o0 | |
520 | srlx %o1,32,%o1 | |
521 | or %g1,%o0,%o0 | |
522 | or %g5,%o1,%o1 | |
523 | ||
bcb43bb3 AP |
524 | ldx [$tp],%o2 ! tp[0] |
525 | mulx %o1,%o0,%o0 | |
526 | addcc %o2,%o0,%o0 | |
527 | mulx $n0,%o0,%o0 ! (ap[0]*bp[i]+t[0])*n0 | |
aa2be094 | 528 | stx %o0,[%sp+$bias+$frame+0] |
bcb43bb3 | 529 | |
bcb43bb3 | 530 | ! transfer b[i] to FPU as 4x16-bit values |
6df8c74d AP |
531 | ldda [%o4+2]%asi,$ba |
532 | ldda [%o4+0]%asi,$bb | |
533 | ldda [%o4+6]%asi,$bc | |
534 | ldda [%o4+4]%asi,$bd | |
bcb43bb3 AP |
535 | |
536 | ! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values | |
aa2be094 | 537 | ldda [%sp+$bias+$frame+6]%asi,$na |
bcb43bb3 | 538 | fxtod $ba,$ba |
aa2be094 | 539 | ldda [%sp+$bias+$frame+4]%asi,$nb |
bcb43bb3 | 540 | fxtod $bb,$bb |
aa2be094 | 541 | ldda [%sp+$bias+$frame+2]%asi,$nc |
bcb43bb3 | 542 | fxtod $bc,$bc |
aa2be094 | 543 | ldda [%sp+$bias+$frame+0]%asi,$nd |
bcb43bb3 AP |
544 | fxtod $bd,$bd |
545 | ldd [$ap_l+$j],$alo ! load a[j] in double format | |
546 | fxtod $na,$na | |
547 | ldd [$ap_h+$j],$ahi | |
548 | fxtod $nb,$nb | |
549 | ldd [$np_l+$j],$nlo ! load n[j] in double format | |
550 | fxtod $nc,$nc | |
551 | ldd [$np_h+$j],$nhi | |
552 | fxtod $nd,$nd | |
553 | ||
aa2be094 AP |
554 | fmuld $alo,$ba,$aloa |
555 | fmuld $nlo,$na,$nloa | |
556 | fmuld $alo,$bb,$alob | |
557 | fmuld $nlo,$nb,$nlob | |
558 | fmuld $alo,$bc,$aloc | |
aa2be094 | 559 | faddd $aloa,$nloa,$nloa |
6df8c74d | 560 | fmuld $nlo,$nc,$nloc |
aa2be094 | 561 | fmuld $alo,$bd,$alod |
aa2be094 | 562 | faddd $alob,$nlob,$nlob |
6df8c74d | 563 | fmuld $nlo,$nd,$nlod |
aa2be094 | 564 | fmuld $ahi,$ba,$ahia |
aa2be094 | 565 | faddd $aloc,$nloc,$nloc |
6df8c74d | 566 | fmuld $nhi,$na,$nhia |
aa2be094 | 567 | fmuld $ahi,$bb,$ahib |
aa2be094 | 568 | faddd $alod,$nlod,$nlod |
6df8c74d | 569 | fmuld $nhi,$nb,$nhib |
aa2be094 | 570 | fmuld $ahi,$bc,$ahic |
aa2be094 | 571 | faddd $ahia,$nhia,$nhia |
6df8c74d | 572 | fmuld $nhi,$nc,$nhic |
aa2be094 | 573 | fmuld $ahi,$bd,$ahid |
6df8c74d | 574 | faddd $ahib,$nhib,$nhib |
aa2be094 | 575 | fmuld $nhi,$nd,$nhid |
bcb43bb3 | 576 | |
bcb43bb3 AP |
577 | faddd $ahic,$nhic,$dota ! $nhic |
578 | faddd $ahid,$nhid,$dotb ! $nhid | |
579 | ||
580 | faddd $nloc,$nhia,$nloc | |
581 | faddd $nlod,$nhib,$nlod | |
582 | ||
583 | fdtox $nloa,$nloa | |
584 | fdtox $nlob,$nlob | |
585 | fdtox $nloc,$nloc | |
586 | fdtox $nlod,$nlod | |
587 | ||
588 | std $nloa,[%sp+$bias+$frame+0] | |
589 | std $nlob,[%sp+$bias+$frame+8] | |
590 | std $nloc,[%sp+$bias+$frame+16] | |
2e21922e | 591 | add $j,8,$j |
bcb43bb3 | 592 | std $nlod,[%sp+$bias+$frame+24] |
2e21922e AP |
593 | \f |
594 | ldd [$ap_l+$j],$alo ! load a[j] in double format | |
595 | ldd [$ap_h+$j],$ahi | |
596 | ldd [$np_l+$j],$nlo ! load n[j] in double format | |
597 | ldd [$np_h+$j],$nhi | |
598 | ||
599 | fmuld $alo,$ba,$aloa | |
600 | fmuld $nlo,$na,$nloa | |
601 | fmuld $alo,$bb,$alob | |
602 | fmuld $nlo,$nb,$nlob | |
603 | fmuld $alo,$bc,$aloc | |
bcb43bb3 | 604 | ldx [%sp+$bias+$frame+0],%o0 |
2e21922e AP |
605 | faddd $aloa,$nloa,$nloa |
606 | fmuld $nlo,$nc,$nloc | |
bcb43bb3 | 607 | ldx [%sp+$bias+$frame+8],%o1 |
2e21922e | 608 | fmuld $alo,$bd,$alod |
bcb43bb3 | 609 | ldx [%sp+$bias+$frame+16],%o2 |
2e21922e AP |
610 | faddd $alob,$nlob,$nlob |
611 | fmuld $nlo,$nd,$nlod | |
bcb43bb3 | 612 | ldx [%sp+$bias+$frame+24],%o3 |
2e21922e | 613 | fmuld $ahi,$ba,$ahia |
bcb43bb3 AP |
614 | |
615 | srlx %o0,16,%o7 | |
2e21922e AP |
616 | faddd $aloc,$nloc,$nloc |
617 | fmuld $nhi,$na,$nhia | |
bcb43bb3 | 618 | add %o7,%o1,%o1 |
2e21922e | 619 | fmuld $ahi,$bb,$ahib |
bcb43bb3 | 620 | srlx %o1,16,%o7 |
2e21922e AP |
621 | faddd $alod,$nlod,$nlod |
622 | fmuld $nhi,$nb,$nhib | |
bcb43bb3 | 623 | add %o7,%o2,%o2 |
2e21922e | 624 | fmuld $ahi,$bc,$ahic |
bcb43bb3 | 625 | srlx %o2,16,%o7 |
2e21922e AP |
626 | faddd $ahia,$nhia,$nhia |
627 | fmuld $nhi,$nc,$nhic | |
bcb43bb3 AP |
628 | add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] |
629 | ! why? | |
630 | and %o0,$mask,%o0 | |
2e21922e | 631 | fmuld $ahi,$bd,$ahid |
bcb43bb3 AP |
632 | and %o1,$mask,%o1 |
633 | and %o2,$mask,%o2 | |
2e21922e AP |
634 | faddd $ahib,$nhib,$nhib |
635 | fmuld $nhi,$nd,$nhid | |
bcb43bb3 | 636 | sllx %o1,16,%o1 |
2e21922e | 637 | faddd $dota,$nloa,$nloa |
bcb43bb3 | 638 | sllx %o2,32,%o2 |
2e21922e | 639 | faddd $dotb,$nlob,$nlob |
bcb43bb3 AP |
640 | sllx %o3,48,%o7 |
641 | or %o1,%o0,%o0 | |
2e21922e | 642 | faddd $ahic,$nhic,$dota ! $nhic |
bcb43bb3 | 643 | or %o2,%o0,%o0 |
2e21922e | 644 | faddd $ahid,$nhid,$dotb ! $nhid |
bcb43bb3 AP |
645 | or %o7,%o0,%o0 ! 64-bit result |
646 | ldx [$tp],%o7 | |
2e21922e | 647 | faddd $nloc,$nhia,$nloc |
bcb43bb3 AP |
648 | addcc %o7,%o0,%o0 |
649 | ! end-of-why? | |
2e21922e | 650 | faddd $nlod,$nhib,$nlod |
bcb43bb3 | 651 | srlx %o3,16,%g1 ! 34-bit carry |
2e21922e | 652 | fdtox $nloa,$nloa |
bcb43bb3 AP |
653 | bcs,a %xcc,.+8 |
654 | add %g1,1,%g1 | |
bcb43bb3 | 655 | |
bcb43bb3 AP |
656 | fdtox $nlob,$nlob |
657 | fdtox $nloc,$nloc | |
658 | fdtox $nlod,$nlod | |
659 | ||
660 | std $nloa,[%sp+$bias+$frame+0] | |
661 | std $nlob,[%sp+$bias+$frame+8] | |
2e21922e | 662 | addcc $j,8,$j |
bcb43bb3 | 663 | std $nloc,[%sp+$bias+$frame+16] |
2e21922e | 664 | bz,pn %icc,.Linnerskip |
bcb43bb3 | 665 | std $nlod,[%sp+$bias+$frame+24] |
1c3d2b94 | 666 | \f |
ebae8092 AP |
667 | ba .Linner |
668 | nop | |
669 | .align 32 | |
1c3d2b94 | 670 | .Linner: |
2e21922e AP |
671 | ldd [$ap_l+$j],$alo ! load a[j] in double format |
672 | ldd [$ap_h+$j],$ahi | |
673 | ldd [$np_l+$j],$nlo ! load n[j] in double format | |
674 | ldd [$np_h+$j],$nhi | |
675 | ||
676 | fmuld $alo,$ba,$aloa | |
677 | fmuld $nlo,$na,$nloa | |
678 | fmuld $alo,$bb,$alob | |
679 | fmuld $nlo,$nb,$nlob | |
680 | fmuld $alo,$bc,$aloc | |
bcb43bb3 | 681 | ldx [%sp+$bias+$frame+0],%o0 |
2e21922e AP |
682 | faddd $aloa,$nloa,$nloa |
683 | fmuld $nlo,$nc,$nloc | |
bcb43bb3 | 684 | ldx [%sp+$bias+$frame+8],%o1 |
2e21922e | 685 | fmuld $alo,$bd,$alod |
bcb43bb3 | 686 | ldx [%sp+$bias+$frame+16],%o2 |
2e21922e AP |
687 | faddd $alob,$nlob,$nlob |
688 | fmuld $nlo,$nd,$nlod | |
bcb43bb3 | 689 | ldx [%sp+$bias+$frame+24],%o3 |
2e21922e | 690 | fmuld $ahi,$ba,$ahia |
bcb43bb3 AP |
691 | |
692 | srlx %o0,16,%o7 | |
2e21922e AP |
693 | faddd $aloc,$nloc,$nloc |
694 | fmuld $nhi,$na,$nhia | |
bcb43bb3 | 695 | add %o7,%o1,%o1 |
2e21922e | 696 | fmuld $ahi,$bb,$ahib |
bcb43bb3 | 697 | srlx %o1,16,%o7 |
2e21922e AP |
698 | faddd $alod,$nlod,$nlod |
699 | fmuld $nhi,$nb,$nhib | |
bcb43bb3 | 700 | add %o7,%o2,%o2 |
2e21922e | 701 | fmuld $ahi,$bc,$ahic |
bcb43bb3 | 702 | srlx %o2,16,%o7 |
2e21922e AP |
703 | faddd $ahia,$nhia,$nhia |
704 | fmuld $nhi,$nc,$nhic | |
bcb43bb3 AP |
705 | add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] |
706 | and %o0,$mask,%o0 | |
2e21922e | 707 | fmuld $ahi,$bd,$ahid |
bcb43bb3 AP |
708 | and %o1,$mask,%o1 |
709 | and %o2,$mask,%o2 | |
2e21922e AP |
710 | faddd $ahib,$nhib,$nhib |
711 | fmuld $nhi,$nd,$nhid | |
bcb43bb3 | 712 | sllx %o1,16,%o1 |
2e21922e | 713 | faddd $dota,$nloa,$nloa |
bcb43bb3 | 714 | sllx %o2,32,%o2 |
2e21922e | 715 | faddd $dotb,$nlob,$nlob |
bcb43bb3 AP |
716 | sllx %o3,48,%o7 |
717 | or %o1,%o0,%o0 | |
2e21922e | 718 | faddd $ahic,$nhic,$dota ! $nhic |
bcb43bb3 | 719 | or %o2,%o0,%o0 |
2e21922e | 720 | faddd $ahid,$nhid,$dotb ! $nhid |
bcb43bb3 | 721 | or %o7,%o0,%o0 ! 64-bit result |
2e21922e | 722 | faddd $nloc,$nhia,$nloc |
bcb43bb3 | 723 | addcc %g1,%o0,%o0 |
ebae8092 | 724 | ldx [$tp+8],%o7 ! tp[j] |
2e21922e | 725 | faddd $nlod,$nhib,$nlod |
bcb43bb3 | 726 | srlx %o3,16,%g1 ! 34-bit carry |
2e21922e | 727 | fdtox $nloa,$nloa |
bcb43bb3 AP |
728 | bcs,a %xcc,.+8 |
729 | add %g1,1,%g1 | |
2e21922e | 730 | fdtox $nlob,$nlob |
bcb43bb3 | 731 | addcc %o7,%o0,%o0 |
2e21922e | 732 | fdtox $nloc,$nloc |
bcb43bb3 AP |
733 | bcs,a %xcc,.+8 |
734 | add %g1,1,%g1 | |
735 | ||
736 | stx %o0,[$tp] ! tp[j-1] | |
2e21922e | 737 | fdtox $nlod,$nlod |
1c3d2b94 AP |
738 | |
739 | std $nloa,[%sp+$bias+$frame+0] | |
740 | std $nlob,[%sp+$bias+$frame+8] | |
741 | std $nloc,[%sp+$bias+$frame+16] | |
aa2be094 | 742 | addcc $j,8,$j |
2e21922e | 743 | std $nlod,[%sp+$bias+$frame+24] |
aa2be094 | 744 | bnz,pt %icc,.Linner |
bcb43bb3 | 745 | add $tp,8,$tp |
1c3d2b94 AP |
746 | \f |
747 | .Linnerskip: | |
2e21922e AP |
748 | fdtox $dota,$dota |
749 | fdtox $dotb,$dotb | |
750 | ||
1c3d2b94 AP |
751 | ldx [%sp+$bias+$frame+0],%o0 |
752 | ldx [%sp+$bias+$frame+8],%o1 | |
753 | ldx [%sp+$bias+$frame+16],%o2 | |
754 | ldx [%sp+$bias+$frame+24],%o3 | |
755 | ||
756 | srlx %o0,16,%o7 | |
2e21922e | 757 | std $dota,[%sp+$bias+$frame+32] |
1c3d2b94 | 758 | add %o7,%o1,%o1 |
2e21922e | 759 | std $dotb,[%sp+$bias+$frame+40] |
1c3d2b94 AP |
760 | srlx %o1,16,%o7 |
761 | add %o7,%o2,%o2 | |
762 | srlx %o2,16,%o7 | |
763 | add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] | |
764 | and %o0,$mask,%o0 | |
765 | and %o1,$mask,%o1 | |
766 | and %o2,$mask,%o2 | |
767 | sllx %o1,16,%o1 | |
768 | sllx %o2,32,%o2 | |
769 | sllx %o3,48,%o7 | |
770 | or %o1,%o0,%o0 | |
771 | or %o2,%o0,%o0 | |
2e21922e | 772 | ldx [%sp+$bias+$frame+32],%o4 |
1c3d2b94 | 773 | or %o7,%o0,%o0 ! 64-bit result |
2e21922e | 774 | ldx [%sp+$bias+$frame+40],%o5 |
1c3d2b94 | 775 | addcc %g1,%o0,%o0 |
2e21922e | 776 | ldx [$tp+8],%o7 ! tp[j] |
1c3d2b94 AP |
777 | srlx %o3,16,%g1 ! 34-bit carry |
778 | bcs,a %xcc,.+8 | |
779 | add %g1,1,%g1 | |
780 | ||
1c3d2b94 AP |
781 | addcc %o7,%o0,%o0 |
782 | bcs,a %xcc,.+8 | |
783 | add %g1,1,%g1 | |
784 | ||
785 | stx %o0,[$tp] ! tp[j-1] | |
786 | add $tp,8,$tp | |
bcb43bb3 | 787 | |
2e21922e AP |
788 | srlx %o4,16,%o7 |
789 | add %o7,%o5,%o5 | |
790 | and %o4,$mask,%o4 | |
791 | sllx %o5,16,%o7 | |
792 | or %o7,%o4,%o4 | |
793 | addcc %g1,%o4,%o4 | |
794 | srlx %o5,48,%g1 | |
bcb43bb3 AP |
795 | bcs,a %xcc,.+8 |
796 | add %g1,1,%g1 | |
797 | ||
2e21922e AP |
798 | addcc $carry,%o4,%o4 |
799 | stx %o4,[$tp] ! tp[num-1] | |
bcb43bb3 AP |
800 | mov %g1,$carry |
801 | bcs,a %xcc,.+8 | |
802 | add $carry,1,$carry | |
803 | ||
aa2be094 AP |
804 | addcc $i,8,$i |
805 | bnz %icc,.Louter | |
bcb43bb3 AP |
806 | nop |
807 | \f | |
aa2be094 | 808 | sub %g0,$num,%o7 ! n=-num |
bcb43bb3 AP |
809 | cmp $carry,0 ! clears %icc.c |
810 | bne,pn %icc,.Lsub | |
aa2be094 | 811 | add $tp,8,$tp ! adjust tp to point at the end |
bcb43bb3 AP |
812 | |
813 | ld [$tp-8],%o0 | |
6df8c74d | 814 | ld [$np-4],%o1 |
aa2be094 | 815 | cmp %o0,%o1 ! compare topmost words |
bcb43bb3 AP |
816 | bcs,pt %icc,.Lcopy ! %icc.c is clean if not taken |
817 | nop | |
818 | ||
819 | .align 32,0x1000000 | |
820 | .Lsub: | |
aa2be094 AP |
821 | ldd [$tp+%o7],%o0 |
822 | ldd [$np+%o7],%o2 | |
aa2be094 AP |
823 | subccc %o1,%o2,%o2 |
824 | subccc %o0,%o3,%o3 | |
aa2be094 AP |
825 | std %o2,[$rp+%o7] |
826 | add %o7,8,%o7 | |
827 | brnz,pt %o7,.Lsub | |
bcb43bb3 AP |
828 | nop |
829 | subccc $carry,0,$carry | |
aa2be094 | 830 | bcc,pt %icc,.Lzap |
6df8c74d | 831 | sub %g0,$num,%o7 ! n=-num |
bcb43bb3 AP |
832 | |
833 | .align 16,0x1000000 | |
834 | .Lcopy: | |
aa2be094 | 835 | ldx [$tp+%o7],%o0 |
aa2be094 AP |
836 | srlx %o0,32,%o1 |
837 | std %o0,[$rp+%o7] | |
aa2be094 AP |
838 | add %o7,8,%o7 |
839 | brnz,pt %o7,.Lcopy | |
bcb43bb3 AP |
840 | nop |
841 | ba .Lzap | |
6df8c74d | 842 | sub %g0,$num,%o7 ! n=-num |
bcb43bb3 AP |
843 | |
844 | .align 32 | |
845 | .Lzap: | |
aa2be094 AP |
846 | stx %g0,[$tp+%o7] |
847 | stx %g0,[$ap_l+%o7] | |
848 | stx %g0,[$ap_h+%o7] | |
849 | stx %g0,[$np_l+%o7] | |
850 | stx %g0,[$np_h+%o7] | |
851 | add %o7,8,%o7 | |
852 | brnz,pt %o7,.Lzap | |
bcb43bb3 AP |
853 | nop |
854 | ||
855 | ldx [%sp+$bias+$frame+48],%o7 | |
856 | wr %g0,%o7,%asi ! restore %asi | |
857 | ||
858 | mov 1,%i0 | |
aa2be094 | 859 | .Lret: |
bcb43bb3 AP |
860 | ret |
861 | restore | |
862 | .type $fname,#function | |
863 | .size $fname,(.-$fname) | |
864 | ___ | |
865 | ||
866 | $code =~ s/\`([^\`]*)\`/eval($1)/gem; | |
3b4a0225 AP |
867 | |
868 | # Below substitution makes it possible to compile without demanding | |
869 | # VIS extensions on command line, e.g. -xarch=v9 vs. -xarch=v9a. I | |
870 | # dare to do this, because VIS capability is detected at run-time now | |
871 | # and this routine is not called on CPUs not capable of executing it. Do | |
872 | # note that fzeros is not the only VIS dependency! Another dependency | |
873 | # is implicit and is just _a_ numerical value loaded to %asi register, | |
874 | # which assembler can't recognize as VIS specific... | |
875 | $code =~ s/fzeros\s+%f([0-9]+)/ | |
876 | sprintf(".word\t0x%x\t! fzeros %%f%d",0x81b00c20|($1<<25),$1) | |
877 | /gem; | |
878 | ||
bcb43bb3 | 879 | print $code; |
3b4a0225 | 880 | # flush |
bcb43bb3 | 881 | close STDOUT; |