]>
Commit | Line | Data |
---|---|---|
68ea6068 AP |
1 | #!/usr/bin/env perl |
2 | ||
3 | # ==================================================================== | |
4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | |
5 | # project. Rights for redistribution and usage in source and binary | |
6 | # forms are granted according to the OpenSSL license. | |
7 | # ==================================================================== | |
8 | ||
9 | # December 2005 | |
10 | # | |
11 | # Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons | |
12 | # for undertaking this effort are multiple. First of all, UltraSPARC is not | |
13 | # the whole SPARCv9 universe and other VIS-free implementations deserve | |
14 | # optimized code as much. Secondly, newly introduced UltraSPARC T1, | |
15 | # a.k.a. Niagara, has shared FPU and concurrent FPU-intensive paths, | |
16 | # such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with | |
17 | # several integrated RSA/DSA accelerator circuits accessible through | |
18 | # kernel driver [only(*)], but having decent user-land software | |
19 | # implementation is important too. Finally, there was the desire to | |
20 | # experiment with a dedicated squaring procedure. Yes, this module | |
21 | # implements one, because it was easiest to draft it in SPARCv9 | |
22 | # instructions... | |
23 | ||
24 | # (*) Engine accessing the driver in question is on my TODO list. | |
25 | # For reference, accelerator is estimated to give 6 to 10 times | |
26 | # improvement on single-threaded RSA sign. It should be noted | |
27 | # that 6-10x improvement coefficient does not actually mean | |
28 | # something extraordinary in terms of absolute [single-threaded] | |
29 | # performance, as SPARCv9 instruction set is by all means least | |
30 | # suitable for high performance crypto among other 64 bit | |
31 | # platforms. 6-10x factor simply places T1 in same performance | |
32 | # domain as say AMD64 and IA-64. Improvement of RSA verify doesn't | |
33 | # appear impressive at all, but it's the sign operation which is | |
34 | # far more critical/interesting. | |
35 | ||
36 | # You might notice that inner loops are modulo-scheduled:-) This has | |
37 | # essentially negligible impact on UltraSPARC performance, it's | |
38 | # Fujitsu SPARC64 V users who should notice and hopefully appreciate | |
39 | # the advantage... Currently this module surpasses sparcv9a-mont.pl | |
40 | # by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a | |
41 | # module still has hidden potential [see TODO list there], which is | |
42 | # estimated to be larger than 20%... | |
43 | ||
44 | # int bn_mul_mont( | |
45 | $rp="%i0"; # BN_ULONG *rp, | |
46 | $ap="%i1"; # const BN_ULONG *ap, | |
47 | $bp="%i2"; # const BN_ULONG *bp, | |
48 | $np="%i3"; # const BN_ULONG *np, | |
49 | $n0="%i4"; # const BN_ULONG *n0, | |
50 | $num="%i5"; # int num); | |
51 | # ABI selection: -m64 or -xarch=v9 among the arguments picks the 64-bit $bias/$frame values, otherwise the 32-bit ones are used. | |
52 | $bits=32; | |
53 | for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } | |
54 | if ($bits==64) { $bias=2047; $frame=192; } | |
55 | else { $bias=0; $frame=128; } | |
56 | # Working registers referenced by name in the generated assembly below. | |
57 | $car0="%o0"; | |
58 | $car1="%o1"; | |
59 | $car2="%o2"; # 1 bit, taken from bit 32 of the top-most sum ("srlx $car1,32,$car2") | |
60 | $acc0="%o3"; | |
61 | $acc1="%o4"; | |
62 | $mask="%g1"; # 32 bits (set to 0xffffffff by the sethi/or pair), what a waste... | |
63 | $tmp0="%g4"; | |
64 | $tmp1="%g5"; | |
65 | # Loop state: $i and $j are byte offsets (stepped by 4), $tp walks the temporary vector on the stack. | |
66 | $i="%l0"; | |
67 | $j="%l1"; | |
68 | $mul0="%l2"; # bp[i] (ap[i] in the squaring path) | |
69 | $mul1="%l3"; # "t[0]"*n0 masked to 32 bits | |
70 | $tp="%l4"; # pointer into the temporary vector | |
71 | $apj="%l5"; # ap[j] | |
72 | $npj="%l6"; # np[j] | |
73 | $tpj="%l7"; # tp[j] | |
74 | # Name of the global symbol emitted by the heredoc below. | |
a00e414f | 75 | $fname="bn_mul_mont_int"; |
68ea6068 AP |
76 | # Generates $fname: returns 0 if num<4, branches to .Lbn_sqr_mont if ap==bp, else runs the .L1st/.Louter passes into an alloca'd buffer; .Ltail then writes rp[], zeroes the buffer and returns 1. |
77 | $code=<<___; | |
78 | .section ".text",#alloc,#execinstr | |
79 | ||
80 | .global $fname | |
81 | .align 32 | |
82 | $fname: | |
83 | cmp %o5,4 ! 128 bits minimum | |
84 | bge,pt %icc,.Lenter | |
85 | sethi %hi(0xffffffff),$mask | |
86 | retl | |
87 | clr %o0 | |
88 | .align 32 | |
89 | .Lenter: | |
90 | save %sp,-$frame,%sp | |
91 | sll $num,2,$num ! num*=4 | |
92 | or $mask,%lo(0xffffffff),$mask | |
93 | ld [$n0],$n0 | |
94 | cmp $ap,$bp | |
95 | and $num,$mask,$num | |
96 | ld [$bp],$mul0 ! bp[0] | |
97 | be,pt `$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont | |
98 | nop | |
99 | ||
100 | add %sp,$bias,%o7 ! real top of stack | |
101 | ld [$ap],$car0 ! ap[0] | |
102 | sub %o7,$num,%o7 | |
103 | ld [$ap+4],$apj ! ap[1] | |
104 | and %o7,-1024,%o7 | |
105 | ld [$np],$car1 ! np[0] | |
106 | sub %o7,$bias,%sp ! alloca | |
107 | ld [$np+4],$npj ! np[1] | |
108 | mov 12,$j | |
109 | ||
110 | mulx $car0,$mul0,$car0 ! ap[0]*bp[0] | |
111 | mulx $apj,$mul0,$tmp0 !prologue! ap[1]*bp[0] | |
112 | and $car0,$mask,$acc0 | |
113 | add %sp,$bias+$frame,$tp | |
114 | ld [$ap+8],$apj !prologue! | |
115 | ||
116 | mulx $n0,$acc0,$mul1 ! "t[0]"*n0 | |
117 | and $mul1,$mask,$mul1 | |
118 | ||
119 | mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0 | |
120 | mulx $npj,$mul1,$acc1 !prologue! np[1]*"t[0]"*n0 | |
121 | srlx $car0,32,$car0 | |
122 | add $acc0,$car1,$car1 | |
123 | ld [$np+8],$npj !prologue! | |
124 | srlx $car1,32,$car1 | |
125 | mov $tmp0,$acc0 !prologue! | |
126 | ||
127 | .L1st: | |
128 | mulx $apj,$mul0,$tmp0 | |
129 | mulx $npj,$mul1,$tmp1 | |
130 | add $acc0,$car0,$car0 | |
131 | ld [$ap+$j],$apj ! ap[j] | |
132 | and $car0,$mask,$acc0 | |
133 | add $acc1,$car1,$car1 | |
134 | ld [$np+$j],$npj ! np[j] | |
135 | srlx $car0,32,$car0 | |
136 | add $acc0,$car1,$car1 | |
137 | add $j,4,$j ! j++ | |
138 | mov $tmp0,$acc0 | |
139 | st $car1,[$tp] | |
140 | cmp $j,$num | |
141 | mov $tmp1,$acc1 | |
142 | srlx $car1,32,$car1 | |
143 | bl %icc,.L1st | |
144 | add $tp,4,$tp ! tp++ | |
145 | !.L1st | |
146 | ||
147 | mulx $apj,$mul0,$tmp0 !epilogue! | |
148 | mulx $npj,$mul1,$tmp1 | |
149 | add $acc0,$car0,$car0 | |
150 | and $car0,$mask,$acc0 | |
151 | add $acc1,$car1,$car1 | |
152 | srlx $car0,32,$car0 | |
153 | add $acc0,$car1,$car1 | |
154 | st $car1,[$tp] | |
155 | srlx $car1,32,$car1 | |
156 | ||
157 | add $tmp0,$car0,$car0 | |
158 | and $car0,$mask,$acc0 | |
159 | add $tmp1,$car1,$car1 | |
160 | srlx $car0,32,$car0 | |
161 | add $acc0,$car1,$car1 | |
162 | st $car1,[$tp+4] | |
163 | srlx $car1,32,$car1 | |
164 | ||
165 | add $car0,$car1,$car1 | |
166 | st $car1,[$tp+8] | |
167 | srlx $car1,32,$car2 | |
168 | \f | |
169 | mov 4,$i ! i++ | |
170 | ld [$bp+4],$mul0 ! bp[1] | |
171 | .Louter: | |
172 | add %sp,$bias+$frame,$tp | |
173 | ld [$ap],$car0 ! ap[0] | |
174 | ld [$ap+4],$apj ! ap[1] | |
175 | ld [$np],$car1 ! np[0] | |
176 | ld [$np+4],$npj ! np[1] | |
177 | ld [$tp],$tmp1 ! tp[0] | |
178 | ld [$tp+4],$tpj ! tp[1] | |
179 | mov 12,$j | |
180 | ||
181 | mulx $car0,$mul0,$car0 | |
182 | mulx $apj,$mul0,$tmp0 !prologue! | |
183 | add $tmp1,$car0,$car0 | |
184 | ld [$ap+8],$apj !prologue! | |
185 | and $car0,$mask,$acc0 | |
186 | ||
187 | mulx $n0,$acc0,$mul1 | |
188 | and $mul1,$mask,$mul1 | |
189 | ||
190 | mulx $car1,$mul1,$car1 | |
191 | mulx $npj,$mul1,$acc1 !prologue! | |
192 | srlx $car0,32,$car0 | |
193 | add $acc0,$car1,$car1 | |
194 | ld [$np+8],$npj !prologue! | |
195 | srlx $car1,32,$car1 | |
196 | mov $tmp0,$acc0 !prologue! | |
197 | ||
198 | .Linner: | |
199 | mulx $apj,$mul0,$tmp0 | |
200 | mulx $npj,$mul1,$tmp1 | |
201 | add $tpj,$car0,$car0 | |
202 | ld [$ap+$j],$apj ! ap[j] | |
203 | add $acc0,$car0,$car0 | |
204 | add $acc1,$car1,$car1 | |
205 | ld [$np+$j],$npj ! np[j] | |
206 | and $car0,$mask,$acc0 | |
207 | ld [$tp+8],$tpj ! tp[j] | |
208 | srlx $car0,32,$car0 | |
209 | add $acc0,$car1,$car1 | |
210 | add $j,4,$j ! j++ | |
211 | mov $tmp0,$acc0 | |
212 | st $car1,[$tp] ! tp[j-1] | |
213 | srlx $car1,32,$car1 | |
214 | mov $tmp1,$acc1 | |
215 | cmp $j,$num | |
216 | bl %icc,.Linner | |
217 | add $tp,4,$tp ! tp++ | |
218 | !.Linner | |
219 | ||
220 | mulx $apj,$mul0,$tmp0 !epilogue! | |
221 | mulx $npj,$mul1,$tmp1 | |
222 | add $tpj,$car0,$car0 | |
223 | add $acc0,$car0,$car0 | |
224 | ld [$tp+8],$tpj ! tp[j] | |
225 | and $car0,$mask,$acc0 | |
226 | add $acc1,$car1,$car1 | |
227 | srlx $car0,32,$car0 | |
228 | add $acc0,$car1,$car1 | |
229 | st $car1,[$tp] ! tp[j-1] | |
230 | srlx $car1,32,$car1 | |
231 | ||
232 | add $tpj,$car0,$car0 | |
233 | add $tmp0,$car0,$car0 | |
234 | and $car0,$mask,$acc0 | |
235 | add $tmp1,$car1,$car1 | |
236 | add $acc0,$car1,$car1 | |
237 | st $car1,[$tp+4] ! tp[j-1] | |
238 | srlx $car0,32,$car0 | |
239 | add $i,4,$i ! i++ | |
240 | srlx $car1,32,$car1 | |
241 | ||
242 | add $car0,$car1,$car1 | |
243 | cmp $i,$num | |
244 | add $car2,$car1,$car1 | |
245 | st $car1,[$tp+8] | |
246 | ||
247 | srlx $car1,32,$car2 | |
248 | bl,a %icc,.Louter | |
249 | ld [$bp+$i],$mul0 ! bp[i] | |
250 | !.Louter | |
251 | ||
252 | add $tp,12,$tp | |
253 | \f | |
254 | .Ltail: | |
255 | add $np,$num,$np | |
256 | add $rp,$num,$rp | |
257 | ||
258 | cmp $car2,0 ! clears %icc.c | |
259 | bne,pn %icc,.Lsub | |
260 | sub %g0,$num,%o7 ! k=-num | |
261 | ||
262 | cmp $car1,$npj ! compare top-most $tp and $np words | |
263 | bcs,pt %icc,.Lcopy ! %icc.c is clean if not taken | |
264 | nop | |
265 | ||
266 | .align 16,0x1000000 | |
267 | .Lsub: | |
268 | ld [$tp+%o7],%o0 | |
269 | ld [$np+%o7],%o1 | |
270 | subccc %o0,%o1,%o1 | |
271 | st %o1,[$rp+%o7] | |
272 | add %o7,4,%o7 | |
273 | brnz %o7,.Lsub | |
274 | nop | |
275 | subccc $car2,0,$car2 | |
276 | bcc %icc,.Lzap | |
277 | sub %g0,$num,%o7 | |
278 | ||
279 | .align 16,0x1000000 | |
280 | .Lcopy: | |
281 | ld [$tp+%o7],%o0 | |
282 | st %o0,[$rp+%o7] | |
283 | add %o7,4,%o7 | |
284 | brnz %o7,.Lcopy | |
285 | nop | |
286 | ba .Lzap | |
287 | sub %g0,$num,%o7 | |
288 | ||
289 | .align 32 | |
290 | .Lzap: | |
291 | st %g0,[$tp+%o7] | |
292 | add %o7,4,%o7 | |
293 | brnz %o7,.Lzap | |
294 | nop | |
295 | mov 1,%i0 | |
296 | ret | |
297 | restore | |
298 | ___ | |
299 | \f | |
300 | ######## | |
a00e414f AP |
301 | ######## .Lbn_sqr_mont gives up to 20% *overall* improvement over |
302 | ######## code without the following dedicated squaring procedure. | |
68ea6068 AP |
303 | ######## |
304 | $sbit="%i2"; # re-use $bp! Carries the bit shifted out of one doubled word ("add x,x,x") into the next. | |
305 | # NOTE(review): some steps (e.g. right after .Lsqr_inner1) add the previous carry, tp[j] and a 32-bit ap term plus the full np[j]*$mul1 product into 64-bit $car1; confirm the sum cannot wrap. | |
306 | $code.=<<___; | |
307 | .align 32 | |
308 | .Lbn_sqr_mont: | |
309 | add %sp,$bias,%o7 ! real top of stack | |
310 | ld [$ap+4],$apj ! ap[1] | |
311 | sub %o7,$num,%o7 | |
312 | ld [$np],$car1 ! np[0] | |
313 | and %o7,-1024,%o7 | |
314 | ld [$np+4],$npj ! np[1] | |
315 | sub %o7,$bias,%sp ! alloca | |
316 | mov 12,$j | |
317 | ||
318 | mulx $mul0,$mul0,$car0 ! ap[0]*ap[0] | |
319 | mulx $apj,$mul0,$tmp0 !prologue! | |
320 | and $car0,$mask,$acc0 | |
321 | add %sp,$bias+$frame,$tp | |
322 | ld [$ap+8],$apj !prologue! | |
323 | ||
324 | mulx $n0,$acc0,$mul1 ! "t[0]"*n0 | |
325 | srlx $car0,32,$car0 | |
326 | and $mul1,$mask,$mul1 | |
327 | ||
328 | mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0 | |
329 | mulx $npj,$mul1,$acc1 !prologue! | |
330 | and $car0,1,$sbit | |
331 | ld [$np+8],$npj !prologue! | |
332 | srlx $car0,1,$car0 | |
333 | add $acc0,$car1,$car1 | |
334 | srlx $car1,32,$car1 | |
335 | mov $tmp0,$acc0 !prologue! | |
336 | ||
337 | .Lsqr_1st: | |
338 | mulx $apj,$mul0,$tmp0 | |
339 | mulx $npj,$mul1,$tmp1 | |
340 | add $acc0,$car0,$car0 ! ap[j]*a0+c0 | |
341 | add $acc1,$car1,$car1 | |
342 | ld [$ap+$j],$apj ! ap[j] | |
343 | and $car0,$mask,$acc0 | |
344 | ld [$np+$j],$npj ! np[j] | |
345 | srlx $car0,32,$car0 | |
346 | add $acc0,$acc0,$acc0 | |
347 | or $sbit,$acc0,$acc0 | |
348 | mov $tmp1,$acc1 | |
349 | srlx $acc0,32,$sbit | |
350 | add $j,4,$j ! j++ | |
351 | and $acc0,$mask,$acc0 | |
352 | cmp $j,$num | |
353 | add $acc0,$car1,$car1 | |
354 | st $car1,[$tp] | |
355 | mov $tmp0,$acc0 | |
356 | srlx $car1,32,$car1 | |
357 | bl %icc,.Lsqr_1st | |
358 | add $tp,4,$tp ! tp++ | |
359 | !.Lsqr_1st | |
360 | ||
361 | mulx $apj,$mul0,$tmp0 ! epilogue | |
362 | mulx $npj,$mul1,$tmp1 | |
363 | add $acc0,$car0,$car0 ! ap[j]*a0+c0 | |
364 | add $acc1,$car1,$car1 | |
365 | and $car0,$mask,$acc0 | |
366 | srlx $car0,32,$car0 | |
367 | add $acc0,$acc0,$acc0 | |
368 | or $sbit,$acc0,$acc0 | |
369 | srlx $acc0,32,$sbit | |
370 | and $acc0,$mask,$acc0 | |
371 | add $acc0,$car1,$car1 | |
372 | st $car1,[$tp] | |
373 | srlx $car1,32,$car1 | |
374 | ||
375 | add $tmp0,$car0,$car0 ! ap[j]*a0+c0 | |
376 | add $tmp1,$car1,$car1 | |
377 | and $car0,$mask,$acc0 | |
378 | srlx $car0,32,$car0 | |
379 | add $acc0,$acc0,$acc0 | |
380 | or $sbit,$acc0,$acc0 | |
381 | srlx $acc0,32,$sbit | |
382 | and $acc0,$mask,$acc0 | |
383 | add $acc0,$car1,$car1 | |
384 | st $car1,[$tp+4] | |
385 | srlx $car1,32,$car1 | |
386 | ||
387 | add $car0,$car0,$car0 | |
388 | or $sbit,$car0,$car0 | |
389 | add $car0,$car1,$car1 | |
390 | st $car1,[$tp+8] | |
391 | srlx $car1,32,$car2 | |
392 | \f | |
393 | ld [%sp+$bias+$frame],$tmp0 ! tp[0] | |
394 | ld [%sp+$bias+$frame+4],$tmp1 ! tp[1] | |
395 | ld [%sp+$bias+$frame+8],$tpj ! tp[2] | |
396 | ld [$ap+4],$mul0 ! ap[1] | |
397 | ld [$ap+8],$apj ! ap[2] | |
398 | ld [$np],$car1 ! np[0] | |
399 | ld [$np+4],$npj ! np[1] | |
400 | mulx $n0,$tmp0,$mul1 | |
401 | ||
402 | mulx $mul0,$mul0,$car0 | |
403 | and $mul1,$mask,$mul1 | |
404 | ||
405 | mulx $car1,$mul1,$car1 | |
406 | mulx $npj,$mul1,$acc1 | |
407 | add $tmp0,$car1,$car1 | |
408 | and $car0,$mask,$acc0 | |
409 | ld [$np+8],$npj ! np[2] | |
410 | srlx $car1,32,$car1 | |
411 | add $tmp1,$car1,$car1 | |
412 | srlx $car0,32,$car0 | |
413 | add $acc0,$car1,$car1 | |
414 | and $car0,1,$sbit | |
415 | add $acc1,$car1,$car1 | |
416 | srlx $car0,1,$car0 | |
417 | mov 12,$j | |
418 | st $car1,[%sp+$bias+$frame] ! tp[0]= | |
419 | srlx $car1,32,$car1 | |
420 | add %sp,$bias+$frame+4,$tp | |
421 | ||
422 | .Lsqr_2nd: | |
423 | mulx $apj,$mul0,$acc0 | |
424 | mulx $npj,$mul1,$acc1 | |
425 | add $acc0,$car0,$car0 | |
426 | add $tpj,$car1,$car1 | |
427 | ld [$ap+$j],$apj ! ap[j] | |
428 | and $car0,$mask,$acc0 | |
429 | ld [$np+$j],$npj ! np[j] | |
430 | srlx $car0,32,$car0 | |
431 | add $acc1,$car1,$car1 | |
432 | ld [$tp+8],$tpj ! tp[j] | |
433 | add $acc0,$acc0,$acc0 | |
434 | add $j,4,$j ! j++ | |
435 | or $sbit,$acc0,$acc0 | |
436 | srlx $acc0,32,$sbit | |
437 | and $acc0,$mask,$acc0 | |
438 | cmp $j,$num | |
439 | add $acc0,$car1,$car1 | |
440 | st $car1,[$tp] ! tp[j-1] | |
441 | srlx $car1,32,$car1 | |
442 | bl %icc,.Lsqr_2nd | |
443 | add $tp,4,$tp ! tp++ | |
444 | !.Lsqr_2nd | |
445 | ||
446 | mulx $apj,$mul0,$acc0 | |
447 | mulx $npj,$mul1,$acc1 | |
448 | add $acc0,$car0,$car0 | |
449 | add $tpj,$car1,$car1 | |
450 | and $car0,$mask,$acc0 | |
451 | srlx $car0,32,$car0 | |
452 | add $acc1,$car1,$car1 | |
453 | add $acc0,$acc0,$acc0 | |
454 | or $sbit,$acc0,$acc0 | |
455 | srlx $acc0,32,$sbit | |
456 | and $acc0,$mask,$acc0 | |
457 | add $acc0,$car1,$car1 | |
458 | st $car1,[$tp] ! tp[j-1] | |
459 | srlx $car1,32,$car1 | |
460 | ||
461 | add $car0,$car0,$car0 | |
462 | or $sbit,$car0,$car0 | |
463 | add $car0,$car1,$car1 | |
464 | add $car2,$car1,$car1 | |
465 | st $car1,[$tp+4] | |
466 | srlx $car1,32,$car2 | |
467 | \f | |
468 | ld [%sp+$bias+$frame],$tmp1 ! tp[0] | |
469 | ld [%sp+$bias+$frame+4],$tpj ! tp[1] | |
470 | ld [$ap+8],$mul0 ! ap[2] | |
471 | ld [$np],$car1 ! np[0] | |
472 | ld [$np+4],$npj ! np[1] | |
473 | mulx $n0,$tmp1,$mul1 | |
474 | and $mul1,$mask,$mul1 | |
475 | mov 8,$i | |
476 | ||
477 | mulx $mul0,$mul0,$car0 | |
478 | mulx $car1,$mul1,$car1 | |
479 | and $car0,$mask,$acc0 | |
480 | add $tmp1,$car1,$car1 | |
481 | srlx $car0,32,$car0 | |
482 | add %sp,$bias+$frame,$tp | |
483 | srlx $car1,32,$car1 | |
484 | and $car0,1,$sbit | |
485 | srlx $car0,1,$car0 | |
486 | mov 4,$j | |
487 | ||
488 | .Lsqr_outer: | |
489 | .Lsqr_inner1: | |
490 | mulx $npj,$mul1,$acc1 | |
491 | add $tpj,$car1,$car1 | |
492 | add $j,4,$j | |
493 | ld [$tp+8],$tpj | |
494 | cmp $j,$i | |
495 | add $acc1,$car1,$car1 | |
496 | ld [$np+$j],$npj | |
497 | st $car1,[$tp] | |
498 | srlx $car1,32,$car1 | |
499 | bl %icc,.Lsqr_inner1 | |
500 | add $tp,4,$tp | |
501 | !.Lsqr_inner1 | |
502 | ||
503 | add $j,4,$j | |
504 | ld [$ap+$j],$apj ! ap[j] | |
505 | mulx $npj,$mul1,$acc1 | |
506 | add $tpj,$car1,$car1 | |
507 | ld [$np+$j],$npj ! np[j] | |
508 | add $acc0,$car1,$car1 | |
509 | ld [$tp+8],$tpj ! tp[j] | |
510 | add $acc1,$car1,$car1 | |
511 | st $car1,[$tp] | |
512 | srlx $car1,32,$car1 | |
513 | ||
514 | add $j,4,$j | |
515 | cmp $j,$num | |
516 | be,pn %icc,.Lsqr_no_inner2 | |
517 | add $tp,4,$tp | |
518 | ||
519 | .Lsqr_inner2: | |
520 | mulx $apj,$mul0,$acc0 | |
521 | mulx $npj,$mul1,$acc1 | |
522 | add $tpj,$car1,$car1 | |
523 | add $acc0,$car0,$car0 | |
524 | ld [$ap+$j],$apj ! ap[j] | |
525 | and $car0,$mask,$acc0 | |
526 | ld [$np+$j],$npj ! np[j] | |
527 | srlx $car0,32,$car0 | |
528 | add $acc0,$acc0,$acc0 | |
529 | ld [$tp+8],$tpj ! tp[j] | |
530 | or $sbit,$acc0,$acc0 | |
531 | add $j,4,$j ! j++ | |
532 | srlx $acc0,32,$sbit | |
533 | and $acc0,$mask,$acc0 | |
534 | cmp $j,$num | |
535 | add $acc0,$car1,$car1 | |
536 | add $acc1,$car1,$car1 | |
537 | st $car1,[$tp] ! tp[j-1] | |
538 | srlx $car1,32,$car1 | |
539 | bl %icc,.Lsqr_inner2 | |
540 | add $tp,4,$tp ! tp++ | |
541 | ||
542 | .Lsqr_no_inner2: | |
543 | mulx $apj,$mul0,$acc0 | |
544 | mulx $npj,$mul1,$acc1 | |
545 | add $tpj,$car1,$car1 | |
546 | add $acc0,$car0,$car0 | |
547 | and $car0,$mask,$acc0 | |
548 | srlx $car0,32,$car0 | |
549 | add $acc0,$acc0,$acc0 | |
550 | or $sbit,$acc0,$acc0 | |
551 | srlx $acc0,32,$sbit | |
552 | and $acc0,$mask,$acc0 | |
553 | add $acc0,$car1,$car1 | |
554 | add $acc1,$car1,$car1 | |
555 | st $car1,[$tp] ! tp[j-1] | |
556 | srlx $car1,32,$car1 | |
557 | ||
558 | add $car0,$car0,$car0 | |
559 | or $sbit,$car0,$car0 | |
560 | add $car0,$car1,$car1 | |
561 | add $car2,$car1,$car1 | |
562 | st $car1,[$tp+4] | |
563 | srlx $car1,32,$car2 | |
564 | \f | |
565 | add $i,4,$i ! i++ | |
566 | ld [%sp+$bias+$frame],$tmp1 ! tp[0] | |
567 | ld [%sp+$bias+$frame+4],$tpj ! tp[1] | |
568 | ld [$ap+$i],$mul0 ! ap[j] | |
569 | ld [$np],$car1 ! np[0] | |
570 | ld [$np+4],$npj ! np[1] | |
571 | mulx $n0,$tmp1,$mul1 | |
572 | and $mul1,$mask,$mul1 | |
573 | add $i,4,$tmp0 | |
574 | ||
575 | mulx $mul0,$mul0,$car0 | |
576 | mulx $car1,$mul1,$car1 | |
577 | and $car0,$mask,$acc0 | |
578 | add $tmp1,$car1,$car1 | |
579 | srlx $car0,32,$car0 | |
580 | add %sp,$bias+$frame,$tp | |
581 | srlx $car1,32,$car1 | |
582 | and $car0,1,$sbit | |
583 | srlx $car0,1,$car0 | |
584 | ||
585 | cmp $tmp0,$num ! i<num-1 | |
586 | bl %icc,.Lsqr_outer | |
587 | mov 4,$j | |
588 | \f | |
589 | .Lsqr_last: | |
590 | mulx $npj,$mul1,$acc1 | |
591 | add $tpj,$car1,$car1 | |
592 | add $j,4,$j | |
593 | ld [$tp+8],$tpj | |
594 | cmp $j,$i | |
595 | add $acc1,$car1,$car1 | |
596 | ld [$np+$j],$npj | |
597 | st $car1,[$tp] | |
598 | srlx $car1,32,$car1 | |
599 | bl %icc,.Lsqr_last | |
600 | add $tp,4,$tp | |
601 | !.Lsqr_last | |
602 | ||
603 | mulx $npj,$mul1,$acc1 | |
604 | add $tpj,$car1,$car1 | |
605 | add $acc0,$car1,$car1 | |
606 | add $acc1,$car1,$car1 | |
607 | st $car1,[$tp] | |
608 | srlx $car1,32,$car1 | |
609 | ||
610 | add $car0,$car0,$car0 ! recover $car0 | |
611 | or $sbit,$car0,$car0 | |
612 | add $car0,$car1,$car1 | |
613 | add $car2,$car1,$car1 | |
614 | st $car1,[$tp+4] | |
615 | srlx $car1,32,$car2 | |
616 | ||
617 | ba .Ltail | |
618 | add $tp,8,$tp | |
619 | .type $fname,#function | |
620 | .size $fname,(.-$fname) | |
621 | ___ | |
622 | $code =~ s/\`([^\`]*)\`/eval($1)/gem; # evaluate each `...` span as Perl and splice the result into the assembly text | |
623 | print $code; | |
624 | close STDOUT or die "error closing STDOUT: $!"; # buffered write errors surface at close; fail loudly instead of leaving truncated output |