1 .ident "sparcv8plus.s, Version 1.4"
2 .ident "SPARC v9 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
3
4 /*
5 * ====================================================================
6 * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
7 * project.
8 *
9 * Rights for redistribution and usage in source and binary forms are
10 * granted according to the OpenSSL license. Warranty of any kind is
11 * disclaimed.
12 * ====================================================================
13 */
14
15 /*
16 * This is my modest contribution to the OpenSSL project (see
17 * http://www.openssl.org/ for more information about it) and is
18 * a drop-in UltraSPARC ISA replacement for crypto/bn/bn_asm.c
19 * module. For updates see http://fy.chalmers.se/~appro/hpe/.
20 *
21 * Questions-n-answers.
22 *
23 * Q. How to compile?
24 * A. With SC4.x/SC5.x:
25 *
26 * cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
27 *
28 * and with gcc:
29 *
30 * gcc -mcpu=ultrasparc -c bn_asm.sparc.v8plus.S -o bn_asm.o
31 *
32 * or if the above fails (it does if you have gas installed):
33 *
34 * gcc -E bn_asm.sparc.v8plus.S | as -xarch=v8plus /dev/fd/0 -o bn_asm.o
35 *
36 * Quick-n-dirty way to fuse the module into the library.
37 * Provided that the library is already configured and built
38 * (in 0.9.2 case with no-asm option):
39 *
40 * # cd crypto/bn
41 * # cp /some/place/bn_asm.sparc.v8plus.S .
42 * # cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
43 * # make
44 * # cd ../..
45 * # make; make test
46 *
47 * Quick-n-dirty way to get rid of it:
48 *
49 * # cd crypto/bn
50 * # touch bn_asm.c
51 * # make
52 * # cd ../..
53 * # make; make test
54 *
55 * Q. V8plus architecture? What kind of beast is that?
56 * A. Well, it's rather a programming model than an architecture...
57 * It's actually v9-compliant, i.e. it runs on *any* UltraSPARC CPU,
58 * but under special conditions, namely when the kernel doesn't preserve
59 * the upper 32 bits of otherwise 64-bit registers during a context switch.
60 *
61 * Q. Why just UltraSPARC? What about SuperSPARC?
62 * A. Original release did target UltraSPARC only. Now SuperSPARC
63 * version is provided alongside. Both versions share the bn_*comba[48]
64 * implementations (see comment later in code for explanation).
65 * But what's so special about this UltraSPARC implementation?
66 * Why didn't I let the compiler do the job? The trouble is that most
67 * of the available compilers (well, SC5.0 is the only exception) don't
68 * attempt to take advantage of UltraSPARC's 64-bitness under
69 * 32-bit kernels even though it's perfectly possible (see next
70 * question).
71 *
72 * Q. 64-bit registers under 32-bit kernels? Didn't you just say it
73 * doesn't work?
74 * A. You can't address *all* registers as 64-bit wide:-( The catch is
75 * that you actually may rely upon %o0-%o5 and %g1-%g4 being fully
76 * preserved if you're in a leaf function, i.e. one that never calls
77 * any other function. All functions in this module are leaf and
78 * 10 registers is a handful. As a matter of fact the non-"comba"
79 * routines don't even require that much, so I could afford not to
80 * allocate a stack frame of their own for 'em:-)
81 *
82 * Q. What about 64-bit kernels?
83 * A. What about 'em? Just kidding:-) A pure 64-bit version is currently
84 * under evaluation and development...
85 *
86 * Q. What about shared libraries?
87 * A. What about 'em? Kidding again:-) The code does *not* contain
88 * any position dependencies and it's safe to include it in a
89 * shared library as is.
90 *
91 * Q. How much faster does it go?
92 * A. Do you have a good benchmark? In any case, below is what I
93 * observe with the crypto/bn/expspeed.c test program:
94 *
95 * v8plus module on U10/300MHz against bn_asm.c compiled with:
96 *
97 * cc-5.0 -xarch=v8plus -xO5 -xdepend +7-12%
98 * cc-4.2 -xarch=v8plus -xO5 -xdepend +25-35%
99 * egcs-1.1.2 -mcpu=ultrasparc -O3 +35-45%
100 *
101 * v8 module on SS10/60MHz against bn_asm.c compiled with:
102 *
103 * cc-5.0 -xarch=v8 -xO5 -xdepend +7-10%
104 * cc-4.2 -xarch=v8 -xO5 -xdepend +10%
105 * egcs-1.1.2 -mv8 -O3 +35-45%
106 *
107 * As you can see it's damn hard to beat the new Sun C compiler,
108 * and it's GNU C users, first and foremost, who will appreciate
109 * this assembler implementation:-)
110 */
111
112 /*
113 * Revision history.
114 *
115 * 1.0 - initial release;
116 * 1.1 - new loop unrolling model(*);
117 * - some more fine tuning;
118 * 1.2 - made gas friendly;
119 * - updates to documentation concerning v9;
120 * - new performance comparison matrix;
121 * 1.3 - fixed problem with /usr/ccs/lib/cpp;
122 * 1.4 - native V9 bn_*_comba[48] implementation (15% more efficient)
123 * resulting in slight overall performance kick;
124 * - some retunes;
125 * - support for GNU as added;
126 *
127 * (*) Originally unrolled loop looked like this:
128 * for (;;) {
129 * op(p+0); if (--n==0) break;
130 * op(p+1); if (--n==0) break;
131 * op(p+2); if (--n==0) break;
132 * op(p+3); if (--n==0) break;
133 * p+=4;
134 * }
135 * I unroll according to the following:
136 * while (n&~3) {
137 * op(p+0); op(p+1); op(p+2); op(p+3);
138 * p+=4; n-=4;
139 * }
140 * if (n) {
141 * op(p+0); if (--n==0) return;
142 * op(p+1); if (--n==0) return;
143 * op(p+2); return;
144 * }
145 */
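/*
 * A minimal C illustration of the unrolling model above, applied to a
 * made-up word operation. The names walk4() and op1() are hypothetical
 * and exist only to show the control flow; they are not part of this
 * module or of OpenSSL.
 *
 * typedef unsigned int BN_ULONG;
 *
 * static void op1(BN_ULONG *p) { *p ^= 1; }     // stand-in for op()
 *
 * static void walk4(BN_ULONG *p, int n)
 * {
 *     while (n & ~3) {                          // four or more words left
 *         op1(p+0); op1(p+1); op1(p+2); op1(p+3);
 *         p += 4; n -= 4;
 *     }
 *     if (n) {                                  // 1..3 leftover words
 *         op1(p+0); if (--n == 0) return;
 *         op1(p+1); if (--n == 0) return;
 *         op1(p+2);
 *     }
 * }
 */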
146
147 #ifdef OPENSSL_FIPSCANISTER
148 #include <openssl/fipssyms.h>
149 #endif
150
151 #if defined(__SUNPRO_C) && defined(__sparcv9)
152 /* They've said -xarch=v9 on the command line */
153 .register %g2,#scratch
154 .register %g3,#scratch
155 # define FRAME_SIZE -192
156 #elif defined(__GNUC__) && defined(__arch64__)
157 /* They've said -m64 on the command line */
158 .register %g2,#scratch
159 .register %g3,#scratch
160 # define FRAME_SIZE -192
161 #else
162 # define FRAME_SIZE -96
163 #endif
164 /*
165 * GNU assembler can't stand stuw:-(
166 */
167 #define stuw st
168
169 .section ".text",#alloc,#execinstr
170 .file "bn_asm.sparc.v8plus.S"
171
172 .align 32
173
174 .global bn_mul_add_words
175 /*
176 * BN_ULONG bn_mul_add_words(rp,ap,num,w)
177 * BN_ULONG *rp,*ap;
178 * int num;
179 * BN_ULONG w;
180 */
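/*
 * For reference, a C sketch of what this routine computes: a minimal
 * illustration assuming 32-bit BN_ULONG and a 64-bit intermediate,
 * mirroring the portable crypto/bn/bn_asm.c semantics rather than
 * copying its code. The mulx/srlx pairs below are the assembly
 * counterpart of the 64-bit multiply and the carry split.
 *
 * typedef unsigned int       BN_ULONG;
 * typedef unsigned long long u64;
 *
 * BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap,
 *                           int num, BN_ULONG w)
 * {
 *     u64 c = 0;                         // running carry
 *     while (num-- > 0) {
 *         c += (u64)(*ap++) * w + *rp;   // product + r-word + carry
 *         *rp++ = (BN_ULONG)c;           // low 32 bits back to r[]
 *         c >>= 32;                      // high 32 bits carry on
 *     }
 *     return (BN_ULONG)c;                // final carry word
 * }
 */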
181 bn_mul_add_words:
182 sra %o2,%g0,%o2 ! signx %o2
183 brgz,a %o2,.L_bn_mul_add_words_proceed
184 lduw [%o1],%g2
185 retl
186 clr %o0
187 nop
188 nop
189 nop
190
191 .L_bn_mul_add_words_proceed:
192 srl %o3,%g0,%o3 ! clruw %o3
193 andcc %o2,-4,%g0
194 bz,pn %icc,.L_bn_mul_add_words_tail
195 clr %o5
196
197 .L_bn_mul_add_words_loop: ! wow! 32 aligned!
198 lduw [%o0],%g1
199 lduw [%o1+4],%g3
200 mulx %o3,%g2,%g2
201 add %g1,%o5,%o4
202 nop
203 add %o4,%g2,%o4
204 stuw %o4,[%o0]
205 srlx %o4,32,%o5
206
207 lduw [%o0+4],%g1
208 lduw [%o1+8],%g2
209 mulx %o3,%g3,%g3
210 add %g1,%o5,%o4
211 dec 4,%o2
212 add %o4,%g3,%o4
213 stuw %o4,[%o0+4]
214 srlx %o4,32,%o5
215
216 lduw [%o0+8],%g1
217 lduw [%o1+12],%g3
218 mulx %o3,%g2,%g2
219 add %g1,%o5,%o4
220 inc 16,%o1
221 add %o4,%g2,%o4
222 stuw %o4,[%o0+8]
223 srlx %o4,32,%o5
224
225 lduw [%o0+12],%g1
226 mulx %o3,%g3,%g3
227 add %g1,%o5,%o4
228 inc 16,%o0
229 add %o4,%g3,%o4
230 andcc %o2,-4,%g0
231 stuw %o4,[%o0-4]
232 srlx %o4,32,%o5
233 bnz,a,pt %icc,.L_bn_mul_add_words_loop
234 lduw [%o1],%g2
235
236 brnz,a,pn %o2,.L_bn_mul_add_words_tail
237 lduw [%o1],%g2
238 .L_bn_mul_add_words_return:
239 retl
240 mov %o5,%o0
241
242 .L_bn_mul_add_words_tail:
243 lduw [%o0],%g1
244 mulx %o3,%g2,%g2
245 add %g1,%o5,%o4
246 dec %o2
247 add %o4,%g2,%o4
248 srlx %o4,32,%o5
249 brz,pt %o2,.L_bn_mul_add_words_return
250 stuw %o4,[%o0]
251
252 lduw [%o1+4],%g2
253 lduw [%o0+4],%g1
254 mulx %o3,%g2,%g2
255 add %g1,%o5,%o4
256 dec %o2
257 add %o4,%g2,%o4
258 srlx %o4,32,%o5
259 brz,pt %o2,.L_bn_mul_add_words_return
260 stuw %o4,[%o0+4]
261
262 lduw [%o1+8],%g2
263 lduw [%o0+8],%g1
264 mulx %o3,%g2,%g2
265 add %g1,%o5,%o4
266 add %o4,%g2,%o4
267 stuw %o4,[%o0+8]
268 retl
269 srlx %o4,32,%o0
270
271 .type bn_mul_add_words,#function
272 .size bn_mul_add_words,(.-bn_mul_add_words)
273
274 .align 32
275
276 .global bn_mul_words
277 /*
278 * BN_ULONG bn_mul_words(rp,ap,num,w)
279 * BN_ULONG *rp,*ap;
280 * int num;
281 * BN_ULONG w;
282 */
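/*
 * For reference, a C sketch of what this routine computes (same
 * assumptions as the bn_mul_add_words sketch above: 32-bit BN_ULONG,
 * 64-bit intermediate; an illustration, not the build source). The
 * only difference from bn_mul_add_words is that r[] is overwritten
 * instead of accumulated into.
 *
 * BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap,
 *                       int num, BN_ULONG w)
 * {
 *     u64 c = 0;                         // running carry
 *     while (num-- > 0) {
 *         c += (u64)(*ap++) * w;         // product plus carry
 *         *rp++ = (BN_ULONG)c;           // low 32 bits
 *         c >>= 32;                      // high 32 bits carry on
 *     }
 *     return (BN_ULONG)c;
 * }
 */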
283 bn_mul_words:
284 sra %o2,%g0,%o2 ! signx %o2
285 brgz,a %o2,.L_bn_mul_words_proceeed
286 lduw [%o1],%g2
287 retl
288 clr %o0
289 nop
290 nop
291 nop
292
293 .L_bn_mul_words_proceeed:
294 srl %o3,%g0,%o3 ! clruw %o3
295 andcc %o2,-4,%g0
296 bz,pn %icc,.L_bn_mul_words_tail
297 clr %o5
298
299 .L_bn_mul_words_loop: ! wow! 32 aligned!
300 lduw [%o1+4],%g3
301 mulx %o3,%g2,%g2
302 add %g2,%o5,%o4
303 nop
304 stuw %o4,[%o0]
305 srlx %o4,32,%o5
306
307 lduw [%o1+8],%g2
308 mulx %o3,%g3,%g3
309 add %g3,%o5,%o4
310 dec 4,%o2
311 stuw %o4,[%o0+4]
312 srlx %o4,32,%o5
313
314 lduw [%o1+12],%g3
315 mulx %o3,%g2,%g2
316 add %g2,%o5,%o4
317 inc 16,%o1
318 stuw %o4,[%o0+8]
319 srlx %o4,32,%o5
320
321 mulx %o3,%g3,%g3
322 add %g3,%o5,%o4
323 inc 16,%o0
324 stuw %o4,[%o0-4]
325 srlx %o4,32,%o5
326 andcc %o2,-4,%g0
327 bnz,a,pt %icc,.L_bn_mul_words_loop
328 lduw [%o1],%g2
329 nop
330 nop
331
332 brnz,a,pn %o2,.L_bn_mul_words_tail
333 lduw [%o1],%g2
334 .L_bn_mul_words_return:
335 retl
336 mov %o5,%o0
337
338 .L_bn_mul_words_tail:
339 mulx %o3,%g2,%g2
340 add %g2,%o5,%o4
341 dec %o2
342 srlx %o4,32,%o5
343 brz,pt %o2,.L_bn_mul_words_return
344 stuw %o4,[%o0]
345
346 lduw [%o1+4],%g2
347 mulx %o3,%g2,%g2
348 add %g2,%o5,%o4
349 dec %o2
350 srlx %o4,32,%o5
351 brz,pt %o2,.L_bn_mul_words_return
352 stuw %o4,[%o0+4]
353
354 lduw [%o1+8],%g2
355 mulx %o3,%g2,%g2
356 add %g2,%o5,%o4
357 stuw %o4,[%o0+8]
358 retl
359 srlx %o4,32,%o0
360
361 .type bn_mul_words,#function
362 .size bn_mul_words,(.-bn_mul_words)
363
364 .align 32
365 .global bn_sqr_words
366 /*
367 * void bn_sqr_words(r,a,n)
368 * BN_ULONG *r,*a;
369 * int n;
370 */
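/*
 * For reference, a C sketch of what this routine computes (same 32-bit
 * BN_ULONG/64-bit intermediate assumptions as above; an illustration
 * only): each input word is squared and the 64-bit result is stored as
 * a low/high pair, so r[] must hold 2*n words.
 *
 * void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
 * {
 *     while (n-- > 0) {
 *         u64 t = (u64)(*a) * (*a);      // 32x32 -> 64-bit square
 *         a++;
 *         r[0] = (BN_ULONG)t;            // low word
 *         r[1] = (BN_ULONG)(t >> 32);    // high word
 *         r += 2;
 *     }
 * }
 */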
371 bn_sqr_words:
372 sra %o2,%g0,%o2 ! signx %o2
373 brgz,a %o2,.L_bn_sqr_words_proceeed
374 lduw [%o1],%g2
375 retl
376 clr %o0
377 nop
378 nop
379 nop
380
381 .L_bn_sqr_words_proceeed:
382 andcc %o2,-4,%g0
383 nop
384 bz,pn %icc,.L_bn_sqr_words_tail
385 nop
386
387 .L_bn_sqr_words_loop: ! wow! 32 aligned!
388 lduw [%o1+4],%g3
389 mulx %g2,%g2,%o4
390 stuw %o4,[%o0]
391 srlx %o4,32,%o5
392 stuw %o5,[%o0+4]
393 nop
394
395 lduw [%o1+8],%g2
396 mulx %g3,%g3,%o4
397 dec 4,%o2
398 stuw %o4,[%o0+8]
399 srlx %o4,32,%o5
400 stuw %o5,[%o0+12]
401
402 lduw [%o1+12],%g3
403 mulx %g2,%g2,%o4
404 srlx %o4,32,%o5
405 stuw %o4,[%o0+16]
406 inc 16,%o1
407 stuw %o5,[%o0+20]
408
409 mulx %g3,%g3,%o4
410 inc 32,%o0
411 stuw %o4,[%o0-8]
412 srlx %o4,32,%o5
413 andcc %o2,-4,%g2
414 stuw %o5,[%o0-4]
415 bnz,a,pt %icc,.L_bn_sqr_words_loop
416 lduw [%o1],%g2
417 nop
418
419 brnz,a,pn %o2,.L_bn_sqr_words_tail
420 lduw [%o1],%g2
421 .L_bn_sqr_words_return:
422 retl
423 clr %o0
424
425 .L_bn_sqr_words_tail:
426 mulx %g2,%g2,%o4
427 dec %o2
428 stuw %o4,[%o0]
429 srlx %o4,32,%o5
430 brz,pt %o2,.L_bn_sqr_words_return
431 stuw %o5,[%o0+4]
432
433 lduw [%o1+4],%g2
434 mulx %g2,%g2,%o4
435 dec %o2
436 stuw %o4,[%o0+8]
437 srlx %o4,32,%o5
438 brz,pt %o2,.L_bn_sqr_words_return
439 stuw %o5,[%o0+12]
440
441 lduw [%o1+8],%g2
442 mulx %g2,%g2,%o4
443 srlx %o4,32,%o5
444 stuw %o4,[%o0+16]
445 stuw %o5,[%o0+20]
446 retl
447 clr %o0
448
449 .type bn_sqr_words,#function
450 .size bn_sqr_words,(.-bn_sqr_words)
451
452 .align 32
453 .global bn_div_words
454 /*
455 * BN_ULONG bn_div_words(h,l,d)
456 * BN_ULONG h,l,d;
457 */
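/*
 * For reference, a C sketch of the computation below (an illustration
 * under the same 32-bit BN_ULONG assumption): the sllx/or pair glues
 * h:l into one 64-bit dividend, udivx divides it, and the quotient is
 * truncated to 32 bits just as the final srl does. Callers are
 * expected to arrange h < d so the quotient actually fits.
 *
 * BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
 * {
 *     u64 n = ((u64)h << 32) | l;        // combine the two halves
 *     return (BN_ULONG)(n / d);          // truncated 32-bit quotient
 * }
 */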
458 bn_div_words:
459 sllx %o0,32,%o0
460 or %o0,%o1,%o0
461 udivx %o0,%o2,%o0
462 retl
463 srl %o0,%g0,%o0 ! clruw %o0
464
465 .type bn_div_words,#function
466 .size bn_div_words,(.-bn_div_words)
467
468 .align 32
469
470 .global bn_add_words
471 /*
472 * BN_ULONG bn_add_words(rp,ap,bp,n)
473 * BN_ULONG *rp,*ap,*bp;
474 * int n;
475 */
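/*
 * For reference, a C sketch of what this routine computes (32-bit
 * BN_ULONG, 64-bit intermediate; an illustration only): a word-wise
 * add with carry propagation, returning the final carry bit, which is
 * what the addccc chain and the movcs below implement.
 *
 * BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap,
 *                       const BN_ULONG *bp, int n)
 * {
 *     u64 t = 0;                         // carry lives in bit 32
 *     while (n-- > 0) {
 *         t = (t >> 32) + (u64)(*ap++) + (*bp++);
 *         *rp++ = (BN_ULONG)t;
 *     }
 *     return (BN_ULONG)(t >> 32);        // final carry, 0 or 1
 * }
 */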
476 bn_add_words:
477 sra %o3,%g0,%o3 ! signx %o3
478 brgz,a %o3,.L_bn_add_words_proceed
479 lduw [%o1],%o4
480 retl
481 clr %o0
482
483 .L_bn_add_words_proceed:
484 andcc %o3,-4,%g0
485 bz,pn %icc,.L_bn_add_words_tail
486 addcc %g0,0,%g0 ! clear carry flag
487
488 .L_bn_add_words_loop: ! wow! 32 aligned!
489 dec 4,%o3
490 lduw [%o2],%o5
491 lduw [%o1+4],%g1
492 lduw [%o2+4],%g2
493 lduw [%o1+8],%g3
494 lduw [%o2+8],%g4
495 addccc %o5,%o4,%o5
496 stuw %o5,[%o0]
497
498 lduw [%o1+12],%o4
499 lduw [%o2+12],%o5
500 inc 16,%o1
501 addccc %g1,%g2,%g1
502 stuw %g1,[%o0+4]
503
504 inc 16,%o2
505 addccc %g3,%g4,%g3
506 stuw %g3,[%o0+8]
507
508 inc 16,%o0
509 addccc %o5,%o4,%o5
510 stuw %o5,[%o0-4]
511 and %o3,-4,%g1
512 brnz,a,pt %g1,.L_bn_add_words_loop
513 lduw [%o1],%o4
514
515 brnz,a,pn %o3,.L_bn_add_words_tail
516 lduw [%o1],%o4
517 .L_bn_add_words_return:
518 clr %o0
519 retl
520 movcs %icc,1,%o0
521 nop
522
523 .L_bn_add_words_tail:
524 lduw [%o2],%o5
525 dec %o3
526 addccc %o5,%o4,%o5
527 brz,pt %o3,.L_bn_add_words_return
528 stuw %o5,[%o0]
529
530 lduw [%o1+4],%o4
531 lduw [%o2+4],%o5
532 dec %o3
533 addccc %o5,%o4,%o5
534 brz,pt %o3,.L_bn_add_words_return
535 stuw %o5,[%o0+4]
536
537 lduw [%o1+8],%o4
538 lduw [%o2+8],%o5
539 addccc %o5,%o4,%o5
540 stuw %o5,[%o0+8]
541 clr %o0
542 retl
543 movcs %icc,1,%o0
544
545 .type bn_add_words,#function
546 .size bn_add_words,(.-bn_add_words)
547
548 .global bn_sub_words
549 /*
550 * BN_ULONG bn_sub_words(rp,ap,bp,n)
551 * BN_ULONG *rp,*ap,*bp;
552 * int n;
553 */
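/*
 * For reference, a C sketch of what this routine computes (32-bit
 * BN_ULONG; an illustration only): a word-wise subtract with borrow
 * propagation, returning the final borrow bit, mirroring the subccc
 * chain and the movcs below.
 *
 * BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap,
 *                       const BN_ULONG *bp, int n)
 * {
 *     BN_ULONG borrow = 0;
 *     while (n-- > 0) {
 *         BN_ULONG a = *ap++, b = *bp++;
 *         *rp++ = a - b - borrow;
 *         // borrow out if a < b, or a == b while borrowing in
 *         borrow = (a < b) || (a == b && borrow);
 *     }
 *     return borrow;                     // 1 if the result went negative
 * }
 */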
554 bn_sub_words:
555 sra %o3,%g0,%o3 ! signx %o3
556 brgz,a %o3,.L_bn_sub_words_proceed
557 lduw [%o1],%o4
558 retl
559 clr %o0
560
561 .L_bn_sub_words_proceed:
562 andcc %o3,-4,%g0
563 bz,pn %icc,.L_bn_sub_words_tail
564 addcc %g0,0,%g0 ! clear carry flag
565
566 .L_bn_sub_words_loop: ! wow! 32 aligned!
567 dec 4,%o3
568 lduw [%o2],%o5
569 lduw [%o1+4],%g1
570 lduw [%o2+4],%g2
571 lduw [%o1+8],%g3
572 lduw [%o2+8],%g4
573 subccc %o4,%o5,%o5
574 stuw %o5,[%o0]
575
576 lduw [%o1+12],%o4
577 lduw [%o2+12],%o5
578 inc 16,%o1
579 subccc %g1,%g2,%g2
580 stuw %g2,[%o0+4]
581
582 inc 16,%o2
583 subccc %g3,%g4,%g4
584 stuw %g4,[%o0+8]
585
586 inc 16,%o0
587 subccc %o4,%o5,%o5
588 stuw %o5,[%o0-4]
589 and %o3,-4,%g1
590 brnz,a,pt %g1,.L_bn_sub_words_loop
591 lduw [%o1],%o4
592
593 brnz,a,pn %o3,.L_bn_sub_words_tail
594 lduw [%o1],%o4
595 .L_bn_sub_words_return:
596 clr %o0
597 retl
598 movcs %icc,1,%o0
599 nop
600
601 .L_bn_sub_words_tail: ! wow! 32 aligned!
602 lduw [%o2],%o5
603 dec %o3
604 subccc %o4,%o5,%o5
605 brz,pt %o3,.L_bn_sub_words_return
606 stuw %o5,[%o0]
607
608 lduw [%o1+4],%o4
609 lduw [%o2+4],%o5
610 dec %o3
611 subccc %o4,%o5,%o5
612 brz,pt %o3,.L_bn_sub_words_return
613 stuw %o5,[%o0+4]
614
615 lduw [%o1+8],%o4
616 lduw [%o2+8],%o5
617 subccc %o4,%o5,%o5
618 stuw %o5,[%o0+8]
619 clr %o0
620 retl
621 movcs %icc,1,%o0
622
623 .type bn_sub_words,#function
624 .size bn_sub_words,(.-bn_sub_words)
625
626 /*
627 * The code below depends on the fact that the upper halves of the
628 * %l0-%l7 and %i0-%i7 registers are zeroed by the kernel after a
629 * context switch. In previous versions this comment stated that
630 * "the trouble is that it's not feasible to implement the mumbo-jumbo
631 * in less V9 instructions:-(" which apparently isn't true thanks to
632 * the 'bcs,a %xcc,.+8; inc %rd' pair. But the performance improvement
633 * comes not from the shorter code, but from the elimination of
634 * multicycle non-pairable 'rd %y,%rd' instructions.
635 *
636 * Andy.
637 */
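/*
 * For reference, a C sketch of the mul_add_c() step named in the
 * comments below (an illustration assuming 32-bit BN_ULONG and a
 * 64-bit intermediate, as in the sketches above): each step folds the
 * 64-bit product a*b into a 96-bit column accumulator kept in three
 * 32-bit words (c0,c1,c2). In the assembly, c_12 holds the running
 * 64-bit column sum, while c_3 counts overflow beyond 64 bits in units
 * of t_2==1<<32 via the 'bcs,a %xcc,.+8; add c_3,t_2,c_3' pairs; after
 * the column's low word has been extracted, 'srlx t_1,32,c_12' and
 * 'or c_12,c_3,c_12' form the carry into the next column.
 *
 * static void mul_add_c(BN_ULONG a, BN_ULONG b,
 *                       BN_ULONG *c0, BN_ULONG *c1, BN_ULONG *c2)
 * {
 *     u64 t  = (u64)a * b;                       // full 64-bit product
 *     u64 lo = (u64)*c0 + (BN_ULONG)t;           // add low halves
 *     u64 hi = (u64)*c1 + (t >> 32) + (lo >> 32);
 *     *c0 = (BN_ULONG)lo;
 *     *c1 = (BN_ULONG)hi;
 *     *c2 += (BN_ULONG)(hi >> 32);               // at most +1 per step
 * }
 */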
638
639 /*
640 * Here is register usage map for *all* routines below.
641 */
642 #define t_1 %o0
643 #define t_2 %o1
644 #define c_12 %o2
645 #define c_3 %o3
646
647 #define ap(I) [%i1+4*I]
648 #define bp(I) [%i2+4*I]
649 #define rp(I) [%i0+4*I]
650
651 #define a_0 %l0
652 #define a_1 %l1
653 #define a_2 %l2
654 #define a_3 %l3
655 #define a_4 %l4
656 #define a_5 %l5
657 #define a_6 %l6
658 #define a_7 %l7
659
660 #define b_0 %i3
661 #define b_1 %i4
662 #define b_2 %i5
663 #define b_3 %o4
664 #define b_4 %o5
665 #define b_5 %o7
666 #define b_6 %g1
667 #define b_7 %g4
668
669 .align 32
670 .global bn_mul_comba8
671 /*
672 * void bn_mul_comba8(r,a,b)
673 * BN_ULONG *r,*a,*b;
674 */
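/*
 * For reference, a compact C restatement of what the fully unrolled
 * code below computes (an illustration, same 32-bit BN_ULONG and u64
 * assumptions as above): a column-wise "comba" product, where column k
 * sums every a[i]*b[j] with i+j==k, emits the low 32 bits into r[k]
 * and carries the rest into column k+1.
 *
 * void bn_mul_comba8(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b)
 * {
 *     u64 acc = 0;                       // low 64 bits of current column
 *     BN_ULONG over = 0;                 // overflow beyond 64 bits
 *     for (int k = 0; k < 15; k++) {
 *         for (int i = 0; i <= k; i++) {
 *             int j = k - i;
 *             if (i > 7 || j > 7) continue;
 *             u64 t = (u64)a[i] * b[j];
 *             acc += t;
 *             if (acc < t) over++;       // 64-bit wraparound
 *         }
 *         r[k] = (BN_ULONG)acc;          // low 32 bits of the column
 *         acc  = (acc >> 32) | ((u64)over << 32);  // carry to next column
 *         over = 0;
 *     }
 *     r[15] = (BN_ULONG)acc;
 * }
 */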
675 bn_mul_comba8:
676 save %sp,FRAME_SIZE,%sp
677 mov 1,t_2
678 lduw ap(0),a_0
679 sllx t_2,32,t_2
680 lduw bp(0),b_0 !=
681 lduw bp(1),b_1
682 mulx a_0,b_0,t_1 !mul_add_c(a[0],b[0],c1,c2,c3);
683 srlx t_1,32,c_12
684 stuw t_1,rp(0) !=!r[0]=c1;
685
686 lduw ap(1),a_1
687 mulx a_0,b_1,t_1 !mul_add_c(a[0],b[1],c2,c3,c1);
688 addcc c_12,t_1,c_12
689 clr c_3 !=
690 bcs,a %xcc,.+8
691 add c_3,t_2,c_3
692 lduw ap(2),a_2
693 mulx a_1,b_0,t_1 !=!mul_add_c(a[1],b[0],c2,c3,c1);
694 addcc c_12,t_1,t_1
695 bcs,a %xcc,.+8
696 add c_3,t_2,c_3
697 srlx t_1,32,c_12 !=
698 stuw t_1,rp(1) !r[1]=c2;
699 or c_12,c_3,c_12
700
701 mulx a_2,b_0,t_1 !mul_add_c(a[2],b[0],c3,c1,c2);
702 addcc c_12,t_1,c_12 !=
703 clr c_3
704 bcs,a %xcc,.+8
705 add c_3,t_2,c_3
706 lduw bp(2),b_2 !=
707 mulx a_1,b_1,t_1 !mul_add_c(a[1],b[1],c3,c1,c2);
708 addcc c_12,t_1,c_12
709 bcs,a %xcc,.+8
710 add c_3,t_2,c_3 !=
711 lduw bp(3),b_3
712 mulx a_0,b_2,t_1 !mul_add_c(a[0],b[2],c3,c1,c2);
713 addcc c_12,t_1,t_1
714 bcs,a %xcc,.+8 !=
715 add c_3,t_2,c_3
716 srlx t_1,32,c_12
717 stuw t_1,rp(2) !r[2]=c3;
718 or c_12,c_3,c_12 !=
719
720 mulx a_0,b_3,t_1 !mul_add_c(a[0],b[3],c1,c2,c3);
721 addcc c_12,t_1,c_12
722 clr c_3
723 bcs,a %xcc,.+8 !=
724 add c_3,t_2,c_3
725 mulx a_1,b_2,t_1 !=!mul_add_c(a[1],b[2],c1,c2,c3);
726 addcc c_12,t_1,c_12
727 bcs,a %xcc,.+8 !=
728 add c_3,t_2,c_3
729 lduw ap(3),a_3
730 mulx a_2,b_1,t_1 !mul_add_c(a[2],b[1],c1,c2,c3);
731 addcc c_12,t_1,c_12 !=
732 bcs,a %xcc,.+8
733 add c_3,t_2,c_3
734 lduw ap(4),a_4
735 mulx a_3,b_0,t_1 !=!mul_add_c(a[3],b[0],c1,c2,c3);!=
736 addcc c_12,t_1,t_1
737 bcs,a %xcc,.+8
738 add c_3,t_2,c_3
739 srlx t_1,32,c_12 !=
740 stuw t_1,rp(3) !r[3]=c1;
741 or c_12,c_3,c_12
742
743 mulx a_4,b_0,t_1 !mul_add_c(a[4],b[0],c2,c3,c1);
744 addcc c_12,t_1,c_12 !=
745 clr c_3
746 bcs,a %xcc,.+8
747 add c_3,t_2,c_3
748 mulx a_3,b_1,t_1 !=!mul_add_c(a[3],b[1],c2,c3,c1);
749 addcc c_12,t_1,c_12
750 bcs,a %xcc,.+8
751 add c_3,t_2,c_3
752 mulx a_2,b_2,t_1 !=!mul_add_c(a[2],b[2],c2,c3,c1);
753 addcc c_12,t_1,c_12
754 bcs,a %xcc,.+8
755 add c_3,t_2,c_3
756 lduw bp(4),b_4 !=
757 mulx a_1,b_3,t_1 !mul_add_c(a[1],b[3],c2,c3,c1);
758 addcc c_12,t_1,c_12
759 bcs,a %xcc,.+8
760 add c_3,t_2,c_3 !=
761 lduw bp(5),b_5
762 mulx a_0,b_4,t_1 !mul_add_c(a[0],b[4],c2,c3,c1);
763 addcc c_12,t_1,t_1
764 bcs,a %xcc,.+8 !=
765 add c_3,t_2,c_3
766 srlx t_1,32,c_12
767 stuw t_1,rp(4) !r[4]=c2;
768 or c_12,c_3,c_12 !=
769
770 mulx a_0,b_5,t_1 !mul_add_c(a[0],b[5],c3,c1,c2);
771 addcc c_12,t_1,c_12
772 clr c_3
773 bcs,a %xcc,.+8 !=
774 add c_3,t_2,c_3
775 mulx a_1,b_4,t_1 !mul_add_c(a[1],b[4],c3,c1,c2);
776 addcc c_12,t_1,c_12
777 bcs,a %xcc,.+8 !=
778 add c_3,t_2,c_3
779 mulx a_2,b_3,t_1 !mul_add_c(a[2],b[3],c3,c1,c2);
780 addcc c_12,t_1,c_12
781 bcs,a %xcc,.+8 !=
782 add c_3,t_2,c_3
783 mulx a_3,b_2,t_1 !mul_add_c(a[3],b[2],c3,c1,c2);
784 addcc c_12,t_1,c_12
785 bcs,a %xcc,.+8 !=
786 add c_3,t_2,c_3
787 lduw ap(5),a_5
788 mulx a_4,b_1,t_1 !mul_add_c(a[4],b[1],c3,c1,c2);
789 addcc c_12,t_1,c_12 !=
790 bcs,a %xcc,.+8
791 add c_3,t_2,c_3
792 lduw ap(6),a_6
793 mulx a_5,b_0,t_1 !=!mul_add_c(a[5],b[0],c3,c1,c2);
794 addcc c_12,t_1,t_1
795 bcs,a %xcc,.+8
796 add c_3,t_2,c_3
797 srlx t_1,32,c_12 !=
798 stuw t_1,rp(5) !r[5]=c3;
799 or c_12,c_3,c_12
800
801 mulx a_6,b_0,t_1 !mul_add_c(a[6],b[0],c1,c2,c3);
802 addcc c_12,t_1,c_12 !=
803 clr c_3
804 bcs,a %xcc,.+8
805 add c_3,t_2,c_3
806 mulx a_5,b_1,t_1 !=!mul_add_c(a[5],b[1],c1,c2,c3);
807 addcc c_12,t_1,c_12
808 bcs,a %xcc,.+8
809 add c_3,t_2,c_3
810 mulx a_4,b_2,t_1 !=!mul_add_c(a[4],b[2],c1,c2,c3);
811 addcc c_12,t_1,c_12
812 bcs,a %xcc,.+8
813 add c_3,t_2,c_3
814 mulx a_3,b_3,t_1 !=!mul_add_c(a[3],b[3],c1,c2,c3);
815 addcc c_12,t_1,c_12
816 bcs,a %xcc,.+8
817 add c_3,t_2,c_3
818 mulx a_2,b_4,t_1 !=!mul_add_c(a[2],b[4],c1,c2,c3);
819 addcc c_12,t_1,c_12
820 bcs,a %xcc,.+8
821 add c_3,t_2,c_3
822 lduw bp(6),b_6 !=
823 mulx a_1,b_5,t_1 !mul_add_c(a[1],b[5],c1,c2,c3);
824 addcc c_12,t_1,c_12
825 bcs,a %xcc,.+8
826 add c_3,t_2,c_3 !=
827 lduw bp(7),b_7
828 mulx a_0,b_6,t_1 !mul_add_c(a[0],b[6],c1,c2,c3);
829 addcc c_12,t_1,t_1
830 bcs,a %xcc,.+8 !=
831 add c_3,t_2,c_3
832 srlx t_1,32,c_12
833 stuw t_1,rp(6) !r[6]=c1;
834 or c_12,c_3,c_12 !=
835
836 mulx a_0,b_7,t_1 !mul_add_c(a[0],b[7],c2,c3,c1);
837 addcc c_12,t_1,c_12
838 clr c_3
839 bcs,a %xcc,.+8 !=
840 add c_3,t_2,c_3
841 mulx a_1,b_6,t_1 !mul_add_c(a[1],b[6],c2,c3,c1);
842 addcc c_12,t_1,c_12
843 bcs,a %xcc,.+8 !=
844 add c_3,t_2,c_3
845 mulx a_2,b_5,t_1 !mul_add_c(a[2],b[5],c2,c3,c1);
846 addcc c_12,t_1,c_12
847 bcs,a %xcc,.+8 !=
848 add c_3,t_2,c_3
849 mulx a_3,b_4,t_1 !mul_add_c(a[3],b[4],c2,c3,c1);
850 addcc c_12,t_1,c_12
851 bcs,a %xcc,.+8 !=
852 add c_3,t_2,c_3
853 mulx a_4,b_3,t_1 !mul_add_c(a[4],b[3],c2,c3,c1);
854 addcc c_12,t_1,c_12
855 bcs,a %xcc,.+8 !=
856 add c_3,t_2,c_3
857 mulx a_5,b_2,t_1 !mul_add_c(a[5],b[2],c2,c3,c1);
858 addcc c_12,t_1,c_12
859 bcs,a %xcc,.+8 !=
860 add c_3,t_2,c_3
861 lduw ap(7),a_7
862 mulx a_6,b_1,t_1 !=!mul_add_c(a[6],b[1],c2,c3,c1);
863 addcc c_12,t_1,c_12
864 bcs,a %xcc,.+8
865 add c_3,t_2,c_3
866 mulx a_7,b_0,t_1 !=!mul_add_c(a[7],b[0],c2,c3,c1);
867 addcc c_12,t_1,t_1
868 bcs,a %xcc,.+8
869 add c_3,t_2,c_3
870 srlx t_1,32,c_12 !=
871 stuw t_1,rp(7) !r[7]=c2;
872 or c_12,c_3,c_12
873
874 mulx a_7,b_1,t_1 !=!mul_add_c(a[7],b[1],c3,c1,c2);
875 addcc c_12,t_1,c_12
876 clr c_3
877 bcs,a %xcc,.+8
878 add c_3,t_2,c_3 !=
879 mulx a_6,b_2,t_1 !mul_add_c(a[6],b[2],c3,c1,c2);
880 addcc c_12,t_1,c_12
881 bcs,a %xcc,.+8
882 add c_3,t_2,c_3 !=
883 mulx a_5,b_3,t_1 !mul_add_c(a[5],b[3],c3,c1,c2);
884 addcc c_12,t_1,c_12
885 bcs,a %xcc,.+8
886 add c_3,t_2,c_3 !=
887 mulx a_4,b_4,t_1 !mul_add_c(a[4],b[4],c3,c1,c2);
888 addcc c_12,t_1,c_12
889 bcs,a %xcc,.+8
890 add c_3,t_2,c_3 !=
891 mulx a_3,b_5,t_1 !mul_add_c(a[3],b[5],c3,c1,c2);
892 addcc c_12,t_1,c_12
893 bcs,a %xcc,.+8
894 add c_3,t_2,c_3 !=
895 mulx a_2,b_6,t_1 !mul_add_c(a[2],b[6],c3,c1,c2);
896 addcc c_12,t_1,c_12
897 bcs,a %xcc,.+8
898 add c_3,t_2,c_3 !=
899 mulx a_1,b_7,t_1 !mul_add_c(a[1],b[7],c3,c1,c2);
900 addcc c_12,t_1,t_1
901 bcs,a %xcc,.+8
902 add c_3,t_2,c_3 !=
903 srlx t_1,32,c_12
904 stuw t_1,rp(8) !r[8]=c3;
905 or c_12,c_3,c_12
906
907 mulx a_2,b_7,t_1 !=!mul_add_c(a[2],b[7],c1,c2,c3);
908 addcc c_12,t_1,c_12
909 clr c_3
910 bcs,a %xcc,.+8
911 add c_3,t_2,c_3 !=
912 mulx a_3,b_6,t_1 !mul_add_c(a[3],b[6],c1,c2,c3);
913 addcc c_12,t_1,c_12
914 bcs,a %xcc,.+8 !=
915 add c_3,t_2,c_3
916 mulx a_4,b_5,t_1 !mul_add_c(a[4],b[5],c1,c2,c3);
917 addcc c_12,t_1,c_12
918 bcs,a %xcc,.+8 !=
919 add c_3,t_2,c_3
920 mulx a_5,b_4,t_1 !mul_add_c(a[5],b[4],c1,c2,c3);
921 addcc c_12,t_1,c_12
922 bcs,a %xcc,.+8 !=
923 add c_3,t_2,c_3
924 mulx a_6,b_3,t_1 !mul_add_c(a[6],b[3],c1,c2,c3);
925 addcc c_12,t_1,c_12
926 bcs,a %xcc,.+8 !=
927 add c_3,t_2,c_3
928 mulx a_7,b_2,t_1 !mul_add_c(a[7],b[2],c1,c2,c3);
929 addcc c_12,t_1,t_1
930 bcs,a %xcc,.+8 !=
931 add c_3,t_2,c_3
932 srlx t_1,32,c_12
933 stuw t_1,rp(9) !r[9]=c1;
934 or c_12,c_3,c_12 !=
935
936 mulx a_7,b_3,t_1 !mul_add_c(a[7],b[3],c2,c3,c1);
937 addcc c_12,t_1,c_12
938 clr c_3
939 bcs,a %xcc,.+8 !=
940 add c_3,t_2,c_3
941 mulx a_6,b_4,t_1 !mul_add_c(a[6],b[4],c2,c3,c1);
942 addcc c_12,t_1,c_12
943 bcs,a %xcc,.+8 !=
944 add c_3,t_2,c_3
945 mulx a_5,b_5,t_1 !mul_add_c(a[5],b[5],c2,c3,c1);
946 addcc c_12,t_1,c_12
947 bcs,a %xcc,.+8 !=
948 add c_3,t_2,c_3
949 mulx a_4,b_6,t_1 !mul_add_c(a[4],b[6],c2,c3,c1);
950 addcc c_12,t_1,c_12
951 bcs,a %xcc,.+8 !=
952 add c_3,t_2,c_3
953 mulx a_3,b_7,t_1 !mul_add_c(a[3],b[7],c2,c3,c1);
954 addcc c_12,t_1,t_1
955 bcs,a %xcc,.+8 !=
956 add c_3,t_2,c_3
957 srlx t_1,32,c_12
958 stuw t_1,rp(10) !r[10]=c2;
959 or c_12,c_3,c_12 !=
960
961 mulx a_4,b_7,t_1 !mul_add_c(a[4],b[7],c3,c1,c2);
962 addcc c_12,t_1,c_12
963 clr c_3
964 bcs,a %xcc,.+8 !=
965 add c_3,t_2,c_3
966 mulx a_5,b_6,t_1 !mul_add_c(a[5],b[6],c3,c1,c2);
967 addcc c_12,t_1,c_12
968 bcs,a %xcc,.+8 !=
969 add c_3,t_2,c_3
970 mulx a_6,b_5,t_1 !mul_add_c(a[6],b[5],c3,c1,c2);
971 addcc c_12,t_1,c_12
972 bcs,a %xcc,.+8 !=
973 add c_3,t_2,c_3
974 mulx a_7,b_4,t_1 !mul_add_c(a[7],b[4],c3,c1,c2);
975 addcc c_12,t_1,t_1
976 bcs,a %xcc,.+8 !=
977 add c_3,t_2,c_3
978 srlx t_1,32,c_12
979 stuw t_1,rp(11) !r[11]=c3;
980 or c_12,c_3,c_12 !=
981
982 mulx a_7,b_5,t_1 !mul_add_c(a[7],b[5],c1,c2,c3);
983 addcc c_12,t_1,c_12
984 clr c_3
985 bcs,a %xcc,.+8 !=
986 add c_3,t_2,c_3
987 mulx a_6,b_6,t_1 !mul_add_c(a[6],b[6],c1,c2,c3);
988 addcc c_12,t_1,c_12
989 bcs,a %xcc,.+8 !=
990 add c_3,t_2,c_3
991 mulx a_5,b_7,t_1 !mul_add_c(a[5],b[7],c1,c2,c3);
992 addcc c_12,t_1,t_1
993 bcs,a %xcc,.+8 !=
994 add c_3,t_2,c_3
995 srlx t_1,32,c_12
996 stuw t_1,rp(12) !r[12]=c1;
997 or c_12,c_3,c_12 !=
998
999 mulx a_6,b_7,t_1 !mul_add_c(a[6],b[7],c2,c3,c1);
1000 addcc c_12,t_1,c_12
1001 clr c_3
1002 bcs,a %xcc,.+8 !=
1003 add c_3,t_2,c_3
1004 mulx a_7,b_6,t_1 !mul_add_c(a[7],b[6],c2,c3,c1);
1005 addcc c_12,t_1,t_1
1006 bcs,a %xcc,.+8 !=
1007 add c_3,t_2,c_3
1008 srlx t_1,32,c_12
1009 stuw t_1,rp(13) !r[13]=c2;
1010 or c_12,c_3,c_12 !=
1011
1012 mulx a_7,b_7,t_1 !mul_add_c(a[7],b[7],c3,c1,c2);
1013 addcc c_12,t_1,t_1
1014 srlx t_1,32,c_12 !=
1015 stuw t_1,rp(14) !r[14]=c3;
1016 stuw c_12,rp(15) !r[15]=c1;
1017
1018 ret
1019 restore %g0,%g0,%o0 !=
1020
1021 .type bn_mul_comba8,#function
1022 .size bn_mul_comba8,(.-bn_mul_comba8)
1023
1024 .align 32
1025
1026 .global bn_mul_comba4
1027 /*
1028 * void bn_mul_comba4(r,a,b)
1029 * BN_ULONG *r,*a,*b;
1030 */
1031 bn_mul_comba4:
1032 save %sp,FRAME_SIZE,%sp
1033 lduw ap(0),a_0
1034 mov 1,t_2
1035 lduw bp(0),b_0
1036 sllx t_2,32,t_2 !=
1037 lduw bp(1),b_1
1038 mulx a_0,b_0,t_1 !mul_add_c(a[0],b[0],c1,c2,c3);
1039 srlx t_1,32,c_12
1040 stuw t_1,rp(0) !=!r[0]=c1;
1041
1042 lduw ap(1),a_1
1043 mulx a_0,b_1,t_1 !mul_add_c(a[0],b[1],c2,c3,c1);
1044 addcc c_12,t_1,c_12
1045 clr c_3 !=
1046 bcs,a %xcc,.+8
1047 add c_3,t_2,c_3
1048 lduw ap(2),a_2
1049 mulx a_1,b_0,t_1 !=!mul_add_c(a[1],b[0],c2,c3,c1);
1050 addcc c_12,t_1,t_1
1051 bcs,a %xcc,.+8
1052 add c_3,t_2,c_3
1053 srlx t_1,32,c_12 !=
1054 stuw t_1,rp(1) !r[1]=c2;
1055 or c_12,c_3,c_12
1056
1057 mulx a_2,b_0,t_1 !mul_add_c(a[2],b[0],c3,c1,c2);
1058 addcc c_12,t_1,c_12 !=
1059 clr c_3
1060 bcs,a %xcc,.+8
1061 add c_3,t_2,c_3
1062 lduw bp(2),b_2 !=
1063 mulx a_1,b_1,t_1 !mul_add_c(a[1],b[1],c3,c1,c2);
1064 addcc c_12,t_1,c_12
1065 bcs,a %xcc,.+8
1066 add c_3,t_2,c_3 !=
1067 lduw bp(3),b_3
1068 mulx a_0,b_2,t_1 !mul_add_c(a[0],b[2],c3,c1,c2);
1069 addcc c_12,t_1,t_1
1070 bcs,a %xcc,.+8 !=
1071 add c_3,t_2,c_3
1072 srlx t_1,32,c_12
1073 stuw t_1,rp(2) !r[2]=c3;
1074 or c_12,c_3,c_12 !=
1075
1076 mulx a_0,b_3,t_1 !mul_add_c(a[0],b[3],c1,c2,c3);
1077 addcc c_12,t_1,c_12
1078 clr c_3
1079 bcs,a %xcc,.+8 !=
1080 add c_3,t_2,c_3
1081 mulx a_1,b_2,t_1 !mul_add_c(a[1],b[2],c1,c2,c3);
1082 addcc c_12,t_1,c_12
1083 bcs,a %xcc,.+8 !=
1084 add c_3,t_2,c_3
1085 lduw ap(3),a_3
1086 mulx a_2,b_1,t_1 !mul_add_c(a[2],b[1],c1,c2,c3);
1087 addcc c_12,t_1,c_12 !=
1088 bcs,a %xcc,.+8
1089 add c_3,t_2,c_3
1090 mulx a_3,b_0,t_1 !mul_add_c(a[3],b[0],c1,c2,c3);!=
1091 addcc c_12,t_1,t_1 !=
1092 bcs,a %xcc,.+8
1093 add c_3,t_2,c_3
1094 srlx t_1,32,c_12
1095 stuw t_1,rp(3) !=!r[3]=c1;
1096 or c_12,c_3,c_12
1097
1098 mulx a_3,b_1,t_1 !mul_add_c(a[3],b[1],c2,c3,c1);
1099 addcc c_12,t_1,c_12
1100 clr c_3 !=
1101 bcs,a %xcc,.+8
1102 add c_3,t_2,c_3
1103 mulx a_2,b_2,t_1 !mul_add_c(a[2],b[2],c2,c3,c1);
1104 addcc c_12,t_1,c_12 !=
1105 bcs,a %xcc,.+8
1106 add c_3,t_2,c_3
1107 mulx a_1,b_3,t_1 !mul_add_c(a[1],b[3],c2,c3,c1);
1108 addcc c_12,t_1,t_1 !=
1109 bcs,a %xcc,.+8
1110 add c_3,t_2,c_3
1111 srlx t_1,32,c_12
1112 stuw t_1,rp(4) !=!r[4]=c2;
1113 or c_12,c_3,c_12
1114
1115 mulx a_2,b_3,t_1 !mul_add_c(a[2],b[3],c3,c1,c2);
1116 addcc c_12,t_1,c_12
1117 clr c_3 !=
1118 bcs,a %xcc,.+8
1119 add c_3,t_2,c_3
1120 mulx a_3,b_2,t_1 !mul_add_c(a[3],b[2],c3,c1,c2);
1121 addcc c_12,t_1,t_1 !=
1122 bcs,a %xcc,.+8
1123 add c_3,t_2,c_3
1124 srlx t_1,32,c_12
1125 stuw t_1,rp(5) !=!r[5]=c3;
1126 or c_12,c_3,c_12
1127
1128 mulx a_3,b_3,t_1 !mul_add_c(a[3],b[3],c1,c2,c3);
1129 addcc c_12,t_1,t_1
1130 srlx t_1,32,c_12 !=
1131 stuw t_1,rp(6) !r[6]=c1;
1132 stuw c_12,rp(7) !r[7]=c2;
1133
1134 ret
1135 restore %g0,%g0,%o0
1136
1137 .type bn_mul_comba4,#function
1138 .size bn_mul_comba4,(.-bn_mul_comba4)
1139
1140 .align 32
1141
1142 .global bn_sqr_comba8
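/*
 * void bn_sqr_comba8(r,a)
 * BN_ULONG *r,*a;
 */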
1143 bn_sqr_comba8:
1144 save %sp,FRAME_SIZE,%sp
1145 mov 1,t_2
1146 lduw ap(0),a_0
1147 sllx t_2,32,t_2
1148 lduw ap(1),a_1
1149 mulx a_0,a_0,t_1 !sqr_add_c(a,0,c1,c2,c3);
1150 srlx t_1,32,c_12
1151 stuw t_1,rp(0) !r[0]=c1;
1152
1153 lduw ap(2),a_2
1154 mulx a_0,a_1,t_1 !=!sqr_add_c2(a,1,0,c2,c3,c1);
1155 addcc c_12,t_1,c_12
1156 clr c_3
1157 bcs,a %xcc,.+8
1158 add c_3,t_2,c_3
1159 addcc c_12,t_1,t_1
1160 bcs,a %xcc,.+8
1161 add c_3,t_2,c_3
1162 srlx t_1,32,c_12
1163 stuw t_1,rp(1) !r[1]=c2;
1164 or c_12,c_3,c_12
1165
1166 mulx a_2,a_0,t_1 !sqr_add_c2(a,2,0,c3,c1,c2);
1167 addcc c_12,t_1,c_12
1168 clr c_3
1169 bcs,a %xcc,.+8
1170 add c_3,t_2,c_3
1171 addcc c_12,t_1,c_12
1172 bcs,a %xcc,.+8
1173 add c_3,t_2,c_3
1174 lduw ap(3),a_3
1175 mulx a_1,a_1,t_1 !sqr_add_c(a,1,c3,c1,c2);
1176 addcc c_12,t_1,t_1
1177 bcs,a %xcc,.+8
1178 add c_3,t_2,c_3
1179 srlx t_1,32,c_12
1180 stuw t_1,rp(2) !r[2]=c3;
1181 or c_12,c_3,c_12
1182
1183 mulx a_0,a_3,t_1 !sqr_add_c2(a,3,0,c1,c2,c3);
1184 addcc c_12,t_1,c_12
1185 clr c_3
1186 bcs,a %xcc,.+8
1187 add c_3,t_2,c_3
1188 addcc c_12,t_1,c_12
1189 bcs,a %xcc,.+8
1190 add c_3,t_2,c_3
1191 lduw ap(4),a_4
1192 mulx a_1,a_2,t_1 !sqr_add_c2(a,2,1,c1,c2,c3);
1193 addcc c_12,t_1,c_12
1194 bcs,a %xcc,.+8
1195 add c_3,t_2,c_3
1196 addcc c_12,t_1,t_1
1197 bcs,a %xcc,.+8
1198 add c_3,t_2,c_3
1199 srlx t_1,32,c_12
1200 stuw t_1,rp(3) !r[3]=c1;
1201 or c_12,c_3,c_12
1202
1203 mulx a_4,a_0,t_1 !sqr_add_c2(a,4,0,c2,c3,c1);
1204 addcc c_12,t_1,c_12
1205 clr c_3
1206 bcs,a %xcc,.+8
1207 add c_3,t_2,c_3
1208 addcc c_12,t_1,c_12
1209 bcs,a %xcc,.+8
1210 add c_3,t_2,c_3
1211 mulx a_3,a_1,t_1 !sqr_add_c2(a,3,1,c2,c3,c1);
1212 addcc c_12,t_1,c_12
1213 bcs,a %xcc,.+8
1214 add c_3,t_2,c_3
1215 addcc c_12,t_1,c_12
1216 bcs,a %xcc,.+8
1217 add c_3,t_2,c_3
1218 lduw ap(5),a_5
1219 mulx a_2,a_2,t_1 !sqr_add_c(a,2,c2,c3,c1);
1220 addcc c_12,t_1,t_1
1221 bcs,a %xcc,.+8
1222 add c_3,t_2,c_3
1223 srlx t_1,32,c_12
1224 stuw t_1,rp(4) !r[4]=c2;
1225 or c_12,c_3,c_12
1226
1227 mulx a_0,a_5,t_1 !sqr_add_c2(a,5,0,c3,c1,c2);
1228 addcc c_12,t_1,c_12
1229 clr c_3
1230 bcs,a %xcc,.+8
1231 add c_3,t_2,c_3
1232 addcc c_12,t_1,c_12
1233 bcs,a %xcc,.+8
1234 add c_3,t_2,c_3
1235 mulx a_1,a_4,t_1 !sqr_add_c2(a,4,1,c3,c1,c2);
1236 addcc c_12,t_1,c_12
1237 bcs,a %xcc,.+8
1238 add c_3,t_2,c_3
1239 addcc c_12,t_1,c_12
1240 bcs,a %xcc,.+8
1241 add c_3,t_2,c_3
1242 lduw ap(6),a_6
1243 mulx a_2,a_3,t_1 !sqr_add_c2(a,3,2,c3,c1,c2);
1244 addcc c_12,t_1,c_12
1245 bcs,a %xcc,.+8
1246 add c_3,t_2,c_3
1247 addcc c_12,t_1,t_1
1248 bcs,a %xcc,.+8
1249 add c_3,t_2,c_3
1250 srlx t_1,32,c_12
1251 stuw t_1,rp(5) !r[5]=c3;
1252 or c_12,c_3,c_12
1253
1254 mulx a_6,a_0,t_1 !sqr_add_c2(a,6,0,c1,c2,c3);
1255 addcc c_12,t_1,c_12
1256 clr c_3
1257 bcs,a %xcc,.+8
1258 add c_3,t_2,c_3
1259 addcc c_12,t_1,c_12
1260 bcs,a %xcc,.+8
1261 add c_3,t_2,c_3
1262 mulx a_5,a_1,t_1 !sqr_add_c2(a,5,1,c1,c2,c3);
1263 addcc c_12,t_1,c_12
1264 bcs,a %xcc,.+8
1265 add c_3,t_2,c_3
1266 addcc c_12,t_1,c_12
1267 bcs,a %xcc,.+8
1268 add c_3,t_2,c_3
1269 mulx a_4,a_2,t_1 !sqr_add_c2(a,4,2,c1,c2,c3);
1270 addcc c_12,t_1,c_12
1271 bcs,a %xcc,.+8
1272 add c_3,t_2,c_3
1273 addcc c_12,t_1,c_12
1274 bcs,a %xcc,.+8
1275 add c_3,t_2,c_3
1276 lduw ap(7),a_7
1277 mulx a_3,a_3,t_1 !=!sqr_add_c(a,3,c1,c2,c3);
1278 addcc c_12,t_1,t_1
1279 bcs,a %xcc,.+8
1280 add c_3,t_2,c_3
1281 srlx t_1,32,c_12
1282 stuw t_1,rp(6) !r[6]=c1;
1283 or c_12,c_3,c_12
1284
1285 mulx a_0,a_7,t_1 !sqr_add_c2(a,7,0,c2,c3,c1);
1286 addcc c_12,t_1,c_12
1287 clr c_3
1288 bcs,a %xcc,.+8
1289 add c_3,t_2,c_3
1290 addcc c_12,t_1,c_12
1291 bcs,a %xcc,.+8
1292 add c_3,t_2,c_3
1293 mulx a_1,a_6,t_1 !sqr_add_c2(a,6,1,c2,c3,c1);
1294 addcc c_12,t_1,c_12
1295 bcs,a %xcc,.+8
1296 add c_3,t_2,c_3
1297 addcc c_12,t_1,c_12
1298 bcs,a %xcc,.+8
1299 add c_3,t_2,c_3
1300 mulx a_2,a_5,t_1 !sqr_add_c2(a,5,2,c2,c3,c1);
1301 addcc c_12,t_1,c_12
1302 bcs,a %xcc,.+8
1303 add c_3,t_2,c_3
1304 addcc c_12,t_1,c_12
1305 bcs,a %xcc,.+8
1306 add c_3,t_2,c_3
1307 mulx a_3,a_4,t_1 !sqr_add_c2(a,4,3,c2,c3,c1);
1308 addcc c_12,t_1,c_12
1309 bcs,a %xcc,.+8
1310 add c_3,t_2,c_3
1311 addcc c_12,t_1,t_1
1312 bcs,a %xcc,.+8
1313 add c_3,t_2,c_3
1314 srlx t_1,32,c_12
1315 stuw t_1,rp(7) !r[7]=c2;
1316 or c_12,c_3,c_12
1317
1318 mulx a_7,a_1,t_1 !sqr_add_c2(a,7,1,c3,c1,c2);
1319 addcc c_12,t_1,c_12
1320 clr c_3
1321 bcs,a %xcc,.+8
1322 add c_3,t_2,c_3
1323 addcc c_12,t_1,c_12
1324 bcs,a %xcc,.+8
1325 add c_3,t_2,c_3
1326 mulx a_6,a_2,t_1 !sqr_add_c2(a,6,2,c3,c1,c2);
1327 addcc c_12,t_1,c_12
1328 bcs,a %xcc,.+8
1329 add c_3,t_2,c_3
1330 addcc c_12,t_1,c_12
1331 bcs,a %xcc,.+8
1332 add c_3,t_2,c_3
1333 mulx a_5,a_3,t_1 !sqr_add_c2(a,5,3,c3,c1,c2);
1334 addcc c_12,t_1,c_12
1335 bcs,a %xcc,.+8
1336 add c_3,t_2,c_3
1337 addcc c_12,t_1,c_12
1338 bcs,a %xcc,.+8
1339 add c_3,t_2,c_3
1340 mulx a_4,a_4,t_1 !sqr_add_c(a,4,c3,c1,c2);
1341 addcc c_12,t_1,t_1
1342 bcs,a %xcc,.+8
1343 add c_3,t_2,c_3
1344 srlx t_1,32,c_12
1345 stuw t_1,rp(8) !r[8]=c3;
1346 or c_12,c_3,c_12
1347
1348 mulx a_2,a_7,t_1 !sqr_add_c2(a,7,2,c1,c2,c3);
1349 addcc c_12,t_1,c_12
1350 clr c_3
1351 bcs,a %xcc,.+8
1352 add c_3,t_2,c_3
1353 addcc c_12,t_1,c_12
1354 bcs,a %xcc,.+8
1355 add c_3,t_2,c_3
1356 mulx a_3,a_6,t_1 !sqr_add_c2(a,6,3,c1,c2,c3);
1357 addcc c_12,t_1,c_12
1358 bcs,a %xcc,.+8
1359 add c_3,t_2,c_3
1360 addcc c_12,t_1,c_12
1361 bcs,a %xcc,.+8
1362 add c_3,t_2,c_3
1363 mulx a_4,a_5,t_1 !sqr_add_c2(a,5,4,c1,c2,c3);
1364 addcc c_12,t_1,c_12
1365 bcs,a %xcc,.+8
1366 add c_3,t_2,c_3
1367 addcc c_12,t_1,t_1
1368 bcs,a %xcc,.+8
1369 add c_3,t_2,c_3
1370 srlx t_1,32,c_12
1371 stuw t_1,rp(9) !r[9]=c1;
1372 or c_12,c_3,c_12
1373
1374 mulx a_7,a_3,t_1 !sqr_add_c2(a,7,3,c2,c3,c1);
1375 addcc c_12,t_1,c_12
1376 clr c_3
1377 bcs,a %xcc,.+8
1378 add c_3,t_2,c_3
1379 addcc c_12,t_1,c_12
1380 bcs,a %xcc,.+8
1381 add c_3,t_2,c_3
1382 mulx a_6,a_4,t_1 !sqr_add_c2(a,6,4,c2,c3,c1);
1383 addcc c_12,t_1,c_12
1384 bcs,a %xcc,.+8
1385 add c_3,t_2,c_3
1386 addcc c_12,t_1,c_12
1387 bcs,a %xcc,.+8
1388 add c_3,t_2,c_3
1389 mulx a_5,a_5,t_1 !sqr_add_c(a,5,c2,c3,c1);
1390 addcc c_12,t_1,t_1
1391 bcs,a %xcc,.+8
1392 add c_3,t_2,c_3
1393 srlx t_1,32,c_12
1394 stuw t_1,rp(10) !r[10]=c2;
1395 or c_12,c_3,c_12
1396
1397 mulx a_4,a_7,t_1 !sqr_add_c2(a,7,4,c3,c1,c2);
1398 addcc c_12,t_1,c_12
1399 clr c_3
1400 bcs,a %xcc,.+8
1401 add c_3,t_2,c_3
1402 addcc c_12,t_1,c_12
1403 bcs,a %xcc,.+8
1404 add c_3,t_2,c_3
1405 mulx a_5,a_6,t_1 !sqr_add_c2(a,6,5,c3,c1,c2);
1406 addcc c_12,t_1,c_12
1407 bcs,a %xcc,.+8
1408 add c_3,t_2,c_3
1409 addcc c_12,t_1,t_1
1410 bcs,a %xcc,.+8
1411 add c_3,t_2,c_3
1412 srlx t_1,32,c_12
1413 stuw t_1,rp(11) !r[11]=c3;
1414 or c_12,c_3,c_12
1415
1416 mulx a_7,a_5,t_1 !sqr_add_c2(a,7,5,c1,c2,c3);
1417 addcc c_12,t_1,c_12
1418 clr c_3
1419 bcs,a %xcc,.+8
1420 add c_3,t_2,c_3
1421 addcc c_12,t_1,c_12
1422 bcs,a %xcc,.+8
1423 add c_3,t_2,c_3
1424 mulx a_6,a_6,t_1 !sqr_add_c(a,6,c1,c2,c3);
1425 addcc c_12,t_1,t_1
1426 bcs,a %xcc,.+8
1427 add c_3,t_2,c_3
1428 srlx t_1,32,c_12
1429 stuw t_1,rp(12) !r[12]=c1;
1430 or c_12,c_3,c_12
1431
1432 mulx a_6,a_7,t_1 !sqr_add_c2(a,7,6,c2,c3,c1);
1433 addcc c_12,t_1,c_12
1434 clr c_3
1435 bcs,a %xcc,.+8
1436 add c_3,t_2,c_3
1437 addcc c_12,t_1,t_1
1438 bcs,a %xcc,.+8
1439 add c_3,t_2,c_3
1440 srlx t_1,32,c_12
1441 stuw t_1,rp(13) !r[13]=c2;
1442 or c_12,c_3,c_12
1443
1444 mulx a_7,a_7,t_1 !sqr_add_c(a,7,c3,c1,c2);
1445 addcc c_12,t_1,t_1
1446 srlx t_1,32,c_12
1447 stuw t_1,rp(14) !r[14]=c3;
1448 stuw c_12,rp(15) !r[15]=c1;
1449
1450 ret
1451 restore %g0,%g0,%o0
1452
1453 .type bn_sqr_comba8,#function
1454 .size bn_sqr_comba8,(.-bn_sqr_comba8)
1455
1456 .align 32
1457
1458 .global bn_sqr_comba4
1459 /*
1460 * void bn_sqr_comba4(r,a)
1461 * BN_ULONG *r,*a;
1462 */
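/*
 * For reference, a compact C restatement of what the unrolled code
 * below computes (an illustration, same assumptions as the
 * bn_mul_comba8 sketch above): a comba square, where each cross
 * product a[i]*a[j] with i!=j is added twice (that's what the doubled
 * addcc/bcs,a sequences in the sqr_add_c2() steps do) and each square
 * a[i]*a[i] once.
 *
 * void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
 * {
 *     u64 acc = 0;                       // low 64 bits of current column
 *     BN_ULONG over = 0;                 // overflow beyond 64 bits
 *     for (int k = 0; k < 7; k++) {
 *         for (int i = 0; k - i >= i; i++) {
 *             int j = k - i;
 *             if (j > 3) continue;
 *             u64 t = (u64)a[i] * a[j];
 *             int times = (i == j) ? 1 : 2;      // double the cross terms
 *             while (times--) {
 *                 acc += t;
 *                 if (acc < t) over++;   // 64-bit wraparound
 *             }
 *         }
 *         r[k] = (BN_ULONG)acc;
 *         acc  = (acc >> 32) | ((u64)over << 32);  // carry to next column
 *         over = 0;
 *     }
 *     r[7] = (BN_ULONG)acc;
 * }
 */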
1463 bn_sqr_comba4:
1464 save %sp,FRAME_SIZE,%sp
1465 mov 1,t_2
1466 lduw ap(0),a_0
1467 sllx t_2,32,t_2
1468 lduw ap(1),a_1
1469 mulx a_0,a_0,t_1 !sqr_add_c(a,0,c1,c2,c3);
1470 srlx t_1,32,c_12
1471 stuw t_1,rp(0) !r[0]=c1;
1472
1473 lduw ap(2),a_2
1474 mulx a_0,a_1,t_1 !sqr_add_c2(a,1,0,c2,c3,c1);
1475 addcc c_12,t_1,c_12
1476 clr c_3
1477 bcs,a %xcc,.+8
1478 add c_3,t_2,c_3
1479 addcc c_12,t_1,t_1
1480 bcs,a %xcc,.+8
1481 add c_3,t_2,c_3
1482 srlx t_1,32,c_12
1483 stuw t_1,rp(1) !r[1]=c2;
1484 or c_12,c_3,c_12
1485
1486 mulx a_2,a_0,t_1 !sqr_add_c2(a,2,0,c3,c1,c2);
1487 addcc c_12,t_1,c_12
1488 clr c_3
1489 bcs,a %xcc,.+8
1490 add c_3,t_2,c_3
1491 addcc c_12,t_1,c_12
1492 bcs,a %xcc,.+8
1493 add c_3,t_2,c_3
1494 lduw ap(3),a_3
1495 mulx a_1,a_1,t_1 !sqr_add_c(a,1,c3,c1,c2);
1496 addcc c_12,t_1,t_1
1497 bcs,a %xcc,.+8
1498 add c_3,t_2,c_3
1499 srlx t_1,32,c_12
1500 stuw t_1,rp(2) !r[2]=c3;
1501 or c_12,c_3,c_12
1502
1503 mulx a_0,a_3,t_1 !sqr_add_c2(a,3,0,c1,c2,c3);
1504 addcc c_12,t_1,c_12
1505 clr c_3
1506 bcs,a %xcc,.+8
1507 add c_3,t_2,c_3
1508 addcc c_12,t_1,c_12
1509 bcs,a %xcc,.+8
1510 add c_3,t_2,c_3
1511 mulx a_1,a_2,t_1 !sqr_add_c2(a,2,1,c1,c2,c3);
1512 addcc c_12,t_1,c_12
1513 bcs,a %xcc,.+8
1514 add c_3,t_2,c_3
1515 addcc c_12,t_1,t_1
1516 bcs,a %xcc,.+8
1517 add c_3,t_2,c_3
1518 srlx t_1,32,c_12
1519 stuw t_1,rp(3) !r[3]=c1;
1520 or c_12,c_3,c_12
1521
1522 mulx a_3,a_1,t_1 !sqr_add_c2(a,3,1,c2,c3,c1);
1523 addcc c_12,t_1,c_12
1524 clr c_3
1525 bcs,a %xcc,.+8
1526 add c_3,t_2,c_3
1527 addcc c_12,t_1,c_12
1528 bcs,a %xcc,.+8
1529 add c_3,t_2,c_3
1530 mulx a_2,a_2,t_1 !sqr_add_c(a,2,c2,c3,c1);
1531 addcc c_12,t_1,t_1
1532 bcs,a %xcc,.+8
1533 add c_3,t_2,c_3
1534 srlx t_1,32,c_12
1535 stuw t_1,rp(4) !r[4]=c2;
1536 or c_12,c_3,c_12
1537
1538 mulx a_2,a_3,t_1 !sqr_add_c2(a,3,2,c3,c1,c2);
1539 addcc c_12,t_1,c_12
1540 clr c_3
1541 bcs,a %xcc,.+8
1542 add c_3,t_2,c_3
1543 addcc c_12,t_1,t_1
1544 bcs,a %xcc,.+8
1545 add c_3,t_2,c_3
1546 srlx t_1,32,c_12
1547 stuw t_1,rp(5) !r[5]=c3;
1548 or c_12,c_3,c_12
1549
1550 mulx a_3,a_3,t_1 !sqr_add_c(a,3,c1,c2,c3);
1551 addcc c_12,t_1,t_1
1552 srlx t_1,32,c_12
1553 stuw t_1,rp(6) !r[6]=c1;
1554 stuw c_12,rp(7) !r[7]=c2;
1555
1556 ret
1557 restore %g0,%g0,%o0
1558
1559 .type bn_sqr_comba4,#function
1560 .size bn_sqr_comba4,(.-bn_sqr_comba4)
1561
1562 .align 32