#! /usr/bin/env perl
# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# December 2005
#
# Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
# for undertaking this effort are several. First of all, UltraSPARC is
# not the whole SPARCv9 universe, and other VIS-free implementations
# deserve optimized code just as much. Secondly, the newly introduced
# UltraSPARC T1, a.k.a. Niagara, has a shared FPU, and concurrent
# FPU-intensive paths, such as sparcv9a-mont, would simply sink it.
# Yes, T1 is equipped with several integrated RSA/DSA accelerator
# circuits accessible through a kernel driver [only(*)], but having a
# decent user-land software implementation is important too. Finally,
# there was the desire to experiment with a dedicated squaring
# procedure. Yes, this module implements one, because it was easiest
# to draft it in SPARCv9 instructions...

# (*)	An Engine accessing the driver in question is on my TODO list.
#	For reference, the accelerator is estimated to give a 6 to 10
#	times improvement on single-threaded RSA sign. It should be
#	noted that a 6-10x improvement coefficient does not actually
#	mean anything extraordinary in terms of absolute
#	[single-threaded] performance, as the SPARCv9 instruction set
#	is by all means the least suitable for high-performance crypto
#	among 64-bit platforms. The 6-10x factor simply places T1 in
#	the same performance domain as, say, AMD64 and IA-64. The
#	improvement of RSA verify doesn't appear impressive at all,
#	but it's the sign operation which is far more
#	critical/interesting.

# You might notice that the inner loops are modulo-scheduled:-) This
# has an essentially negligible impact on UltraSPARC performance; it's
# Fujitsu SPARC64 V users who should notice and hopefully appreciate
# the advantage... Currently this module surpasses sparcv9a-mont.pl
# by ~20% on UltraSPARC-III and later cores, but recall that the
# sparcv9a module still has hidden potential [see the TODO list
# there], which is estimated to be larger than 20%...

$output = pop;
open STDOUT,">$output" or die "can't open $output: $!";

# int bn_mul_mont(
$rp="%i0";	# BN_ULONG *rp,
$ap="%i1";	# const BN_ULONG *ap,
$bp="%i2";	# const BN_ULONG *bp,
$np="%i3";	# const BN_ULONG *np,
$n0="%i4";	# const BN_ULONG *n0,
$num="%i5";	# int num);

$frame="STACK_FRAME";
$bias="STACK_BIAS";

$car0="%o0";
$car1="%o1";
$car2="%o2";	# 1 bit
$acc0="%o3";
$acc1="%o4";
$mask="%g1";	# 32 bits, what a waste...
$tmp0="%g4";
$tmp1="%g5";

$i="%l0";
$j="%l1";
$mul0="%l2";
$mul1="%l3";
$tp="%l4";
$apj="%l5";
$npj="%l6";
$tpj="%l7";

$fname="bn_mul_mont_int";

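# For reference, the assembly below follows the classical word-by-word
# Montgomery multiplication with the multiplication and the reduction
# carry chains interleaved. The Perl model below is purely illustrative:
# it is never called by this generator, the name bn_mul_mont_ref is ours
# and not part of any OpenSSL API, and it assumes a 64-bit perl so that
# 32x32-bit products do not overflow. It mirrors the roles of $car0,
# $car1, $car2, $mul0 and $mul1 in the code that follows.
sub bn_mul_mont_ref {
	my ($ap,$bp,$np,$n0,$num) = @_;	# 32-bit word arrays, least significant word first
	my $mask = 0xffffffff;
	my @tp   = (0) x $num;		# t[], the running Montgomery accumulator
	my $car2 = 0;			# upmost carry bit

	for (my $i=0; $i<$num; $i++) {
		my $mul0 = $bp->[$i];				# bp[i]
		my $car0 = $tp[0] + $ap->[0]*$mul0;		# ap[0]*bp[i]
		my $mul1 = (($car0&$mask)*$n0) & $mask;		# "t[0]"*n0
		my $car1 = ($car0&$mask) + $np->[0]*$mul1;	# low word becomes zero
		$car0 >>= 32;	$car1 >>= 32;
		for (my $j=1; $j<$num; $j++) {
			$car0 += $tp[$j] + $ap->[$j]*$mul0;
			$car1 += ($car0&$mask) + $np->[$j]*$mul1;
			$tp[$j-1] = $car1 & $mask;		# store shifted down one word
			$car0 >>= 32;	$car1 >>= 32;
		}
		$car1 += $car0 + $car2;
		$tp[$num-1] = $car1 & $mask;
		$car2 = $car1 >> 32;
	}
	# the final conditional subtraction of the modulus corresponds
	# to the .Ltail/.Lsub/.Lcopy code path below
	return (\@tp, $car2);
}
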
$code=<<___;
#include "sparc_arch.h"

.section	".text",#alloc,#execinstr

.global	$fname
.align	32
$fname:
	cmp	%o5,4			! 128 bits minimum
	bge,pt	%icc,.Lenter
	sethi	%hi(0xffffffff),$mask
	retl
	clr	%o0
.align	32
.Lenter:
	save	%sp,-$frame,%sp
	sll	$num,2,$num		! num*=4
	or	$mask,%lo(0xffffffff),$mask
	ld	[$n0],$n0
	cmp	$ap,$bp
	and	$num,$mask,$num
	ld	[$bp],$mul0		! bp[0]
	nop

	add	%sp,$bias,%o7		! real top of stack
	ld	[$ap],$car0		! ap[0] ! redundant in squaring context
	sub	%o7,$num,%o7
	ld	[$ap+4],$apj		! ap[1]
	and	%o7,-1024,%o7
	ld	[$np],$car1		! np[0]
	sub	%o7,$bias,%sp		! alloca
	ld	[$np+4],$npj		! np[1]
	be,pt	SIZE_T_CC,.Lbn_sqr_mont
	mov	12,$j

	mulx	$car0,$mul0,$car0	! ap[0]*bp[0]
	mulx	$apj,$mul0,$tmp0	!prologue! ap[1]*bp[0]
	and	$car0,$mask,$acc0
	add	%sp,$bias+$frame,$tp
	ld	[$ap+8],$apj		!prologue!

	mulx	$n0,$acc0,$mul1		! "t[0]"*n0
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1	! np[0]*"t[0]"*n0
	mulx	$npj,$mul1,$acc1	!prologue! np[1]*"t[0]"*n0
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	ld	[$np+8],$npj		!prologue!
	srlx	$car1,32,$car1
	mov	$tmp0,$acc0		!prologue!

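! First pass: compute ap[]*bp[0] and fold in the Montgomery reduction
! word mul1 within the same loop, keeping two carry chains, car0 for
! the multiplication and car1 for the reduction, and storing the
! result shifted down by one word.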
.L1st:
	mulx	$apj,$mul0,$tmp0
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0
	ld	[$ap+$j],$apj		! ap[j]
	and	$car0,$mask,$acc0
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj		! np[j]
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	add	$j,4,$j			! j++
	mov	$tmp0,$acc0
	st	$car1,[$tp]
	cmp	$j,$num
	mov	$tmp1,$acc1
	srlx	$car1,32,$car1
	bl	%icc,.L1st
	add	$tp,4,$tp		! tp++
!.L1st

	mulx	$apj,$mul0,$tmp0	!epilogue!
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0
	and	$car0,$mask,$acc0
	add	$acc1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$tmp0,$car0,$car0
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car1

	add	$car0,$car1,$car1
	st	$car1,[$tp+8]
	srlx	$car1,32,$car2

	mov	4,$i			! i++
	ld	[$bp+4],$mul0		! bp[1]
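! Outer loop: for each remaining bp[i] accumulate ap[]*bp[i] on top of
! tp[], derive the reduction word mul1 from the updated tp[0], and
! reduce, again with both carry chains running concurrently.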
.Louter:
	add	%sp,$bias+$frame,$tp
	ld	[$ap],$car0		! ap[0]
	ld	[$ap+4],$apj		! ap[1]
	ld	[$np],$car1		! np[0]
	ld	[$np+4],$npj		! np[1]
	ld	[$tp],$tmp1		! tp[0]
	ld	[$tp+4],$tpj		! tp[1]
	mov	12,$j

	mulx	$car0,$mul0,$car0
	mulx	$apj,$mul0,$tmp0	!prologue!
	add	$tmp1,$car0,$car0
	ld	[$ap+8],$apj		!prologue!
	and	$car0,$mask,$acc0

	mulx	$n0,$acc0,$mul1
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1
	mulx	$npj,$mul1,$acc1	!prologue!
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	ld	[$np+8],$npj		!prologue!
	srlx	$car1,32,$car1
	mov	$tmp0,$acc0		!prologue!

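! Inner loop: same as .L1st, except that the previously accumulated
! tp[j] word is added into the multiplication carry chain as well.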
.Linner:
	mulx	$apj,$mul0,$tmp0
	mulx	$npj,$mul1,$tmp1
	add	$tpj,$car0,$car0
	ld	[$ap+$j],$apj		! ap[j]
	add	$acc0,$car0,$car0
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj		! np[j]
	and	$car0,$mask,$acc0
	ld	[$tp+8],$tpj		! tp[j]
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	add	$j,4,$j			! j++
	mov	$tmp0,$acc0
	st	$car1,[$tp]		! tp[j-1]
	srlx	$car1,32,$car1
	mov	$tmp1,$acc1
	cmp	$j,$num
	bl	%icc,.Linner
	add	$tp,4,$tp		! tp++
!.Linner

	mulx	$apj,$mul0,$tmp0	!epilogue!
	mulx	$npj,$mul1,$tmp1
	add	$tpj,$car0,$car0
	add	$acc0,$car0,$car0
	ld	[$tp+8],$tpj		! tp[j]
	and	$car0,$mask,$acc0
	add	$acc1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]		! tp[j-1]
	srlx	$car1,32,$car1

	add	$tpj,$car0,$car0
	add	$tmp0,$car0,$car0
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	add	$acc0,$car1,$car1
	st	$car1,[$tp+4]		! tp[j-1]
	srlx	$car0,32,$car0
	add	$i,4,$i			! i++
	srlx	$car1,32,$car1

	add	$car0,$car1,$car1
	cmp	$i,$num
	add	$car2,$car1,$car1
	st	$car1,[$tp+8]

	srlx	$car1,32,$car2
	bl,a	%icc,.Louter
	ld	[$bp+$i],$mul0		! bp[i]
!.Louter

	add	$tp,12,$tp

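! Final step: conditionally subtract the modulus. tp[]-np[] is written
! to rp[] unconditionally, the borrow is folded into the upmost carry
! bit in car2, and an and/andn mask then selects whether .Lcopy reads
! back from tp[] or simply refreshes rp[] in place; tp[] is zapped on
! the way out.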
.Ltail:
	add	$np,$num,$np
	add	$rp,$num,$rp
	mov	$tp,$ap
	sub	%g0,$num,%o7		! k=-num
	ba	.Lsub
	subcc	%g0,%g0,%g0		! clear %icc.c
.align	16
.Lsub:
	ld	[$tp+%o7],%o0
	ld	[$np+%o7],%o1
	subccc	%o0,%o1,%o1		! tp[j]-np[j]
	add	$rp,%o7,$i
	add	%o7,4,%o7
	brnz	%o7,.Lsub
	st	%o1,[$i]
	subc	$car2,0,$car2		! handle upmost overflow bit
	and	$tp,$car2,$ap
	andn	$rp,$car2,$np
	or	$ap,$np,$ap
	sub	%g0,$num,%o7

.Lcopy:
	ld	[$ap+%o7],%o0		! copy or in-place refresh
	st	%g0,[$tp+%o7]		! zap tp
	st	%o0,[$rp+%o7]
	add	%o7,4,%o7
	brnz	%o7,.Lcopy
	nop
	mov	1,%i0
	ret
	restore
___

########
######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
######## the code without the dedicated squaring procedure that follows.
########
$sbit="%o5";

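# The squaring path below essentially exploits the symmetry of the
# product: cross terms ap[i]*ap[j] with i<j are computed only once and
# then doubled, with $sbit carrying the bit shifted out of each doubled
# 32-bit word into the next one, while the diagonal terms ap[i]*ap[i]
# are added in as they are.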
$code.=<<___;
.align	32
.Lbn_sqr_mont:
	mulx	$mul0,$mul0,$car0	! ap[0]*ap[0]
	mulx	$apj,$mul0,$tmp0	!prologue!
	and	$car0,$mask,$acc0
	add	%sp,$bias+$frame,$tp
	ld	[$ap+8],$apj		!prologue!

	mulx	$n0,$acc0,$mul1		! "t[0]"*n0
	srlx	$car0,32,$car0
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1	! np[0]*"t[0]"*n0
	mulx	$npj,$mul1,$acc1	!prologue!
	and	$car0,1,$sbit
	ld	[$np+8],$npj		!prologue!
	srlx	$car0,1,$car0
	add	$acc0,$car1,$car1
	srlx	$car1,32,$car1
	mov	$tmp0,$acc0		!prologue!

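! First squaring pass: ap[j]*ap[0] is computed once, doubled via the
! add/or-sbit pair, and interleaved with the reduction by np[]*mul1,
! much as in .L1st.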
.Lsqr_1st:
	mulx	$apj,$mul0,$tmp0
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0	! ap[j]*a0+c0
	add	$acc1,$car1,$car1
	ld	[$ap+$j],$apj		! ap[j]
	and	$car0,$mask,$acc0
	ld	[$np+$j],$npj		! np[j]
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	or	$sbit,$acc0,$acc0
	mov	$tmp1,$acc1
	srlx	$acc0,32,$sbit
	add	$j,4,$j			! j++
	and	$acc0,$mask,$acc0
	cmp	$j,$num
	add	$acc0,$car1,$car1
	st	$car1,[$tp]
	mov	$tmp0,$acc0
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_1st
	add	$tp,4,$tp		! tp++
!.Lsqr_1st

	mulx	$apj,$mul0,$tmp0	! epilogue
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0	! ap[j]*a0+c0
	add	$acc1,$car1,$car1
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	or	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$tmp0,$car0,$car0	! ap[j]*a0+c0
	add	$tmp1,$car1,$car1
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	or	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0
	or	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	st	$car1,[$tp+8]
	srlx	$car1,32,$car2

	ld	[%sp+$bias+$frame],$tmp0	! tp[0]
	ld	[%sp+$bias+$frame+4],$tmp1	! tp[1]
	ld	[%sp+$bias+$frame+8],$tpj	! tp[2]
	ld	[$ap+4],$mul0		! ap[1]
	ld	[$ap+8],$apj		! ap[2]
	ld	[$np],$car1		! np[0]
	ld	[$np+4],$npj		! np[1]
	mulx	$n0,$tmp0,$mul1

	mulx	$mul0,$mul0,$car0
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1
	mulx	$npj,$mul1,$acc1
	add	$tmp0,$car1,$car1
	and	$car0,$mask,$acc0
	ld	[$np+8],$npj		! np[2]
	srlx	$car1,32,$car1
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	and	$car0,1,$sbit
	add	$acc1,$car1,$car1
	srlx	$car0,1,$car0
	mov	12,$j
	st	$car1,[%sp+$bias+$frame]	! tp[0]=
	srlx	$car1,32,$car1
	add	%sp,$bias+$frame+4,$tp

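! Second pass, for ap[1]: the doubled cross products and the previously
! accumulated tp[j] are folded into the reduction chain through sbit,
! roughly as in .Linner.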
.Lsqr_2nd:
	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$acc0,$car0,$car0
	add	$tpj,$sbit,$sbit
	ld	[$ap+$j],$apj		! ap[j]
	and	$car0,$mask,$acc0
	ld	[$np+$j],$npj		! np[j]
	srlx	$car0,32,$car0
	add	$acc1,$car1,$car1
	ld	[$tp+8],$tpj		! tp[j]
	add	$acc0,$acc0,$acc0
	add	$j,4,$j			! j++
	add	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	cmp	$j,$num
	add	$acc0,$car1,$car1
	st	$car1,[$tp]		! tp[j-1]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_2nd
	add	$tp,4,$tp		! tp++
!.Lsqr_2nd

	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$acc0,$car0,$car0
	add	$tpj,$sbit,$sbit
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc1,$car1,$car1
	add	$acc0,$acc0,$acc0
	add	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]		! tp[j-1]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0
	add	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	add	$car2,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car2

	ld	[%sp+$bias+$frame],$tmp1	! tp[0]
	ld	[%sp+$bias+$frame+4],$tpj	! tp[1]
	ld	[$ap+8],$mul0		! ap[2]
	ld	[$np],$car1		! np[0]
	ld	[$np+4],$npj		! np[1]
	mulx	$n0,$tmp1,$mul1
	and	$mul1,$mask,$mul1
	mov	8,$i

	mulx	$mul0,$mul0,$car0
	mulx	$car1,$mul1,$car1
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	%sp,$bias+$frame,$tp
	srlx	$car1,32,$car1
	and	$car0,1,$sbit
	srlx	$car0,1,$car0
	mov	4,$j

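! Remaining passes: .Lsqr_inner1 only adds the reduction np[j]*mul1 to
! the words below the diagonal (their cross terms were already doubled
! in earlier passes), while .Lsqr_inner2 handles the doubled cross
! terms above the diagonal.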
.Lsqr_outer:
.Lsqr_inner1:
	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	add	$j,4,$j
	ld	[$tp+8],$tpj
	cmp	$j,$i
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj
	st	$car1,[$tp]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_inner1
	add	$tp,4,$tp
!.Lsqr_inner1

	add	$j,4,$j
	ld	[$ap+$j],$apj		! ap[j]
	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	ld	[$np+$j],$npj		! np[j]
	add	$acc0,$car1,$car1
	ld	[$tp+8],$tpj		! tp[j]
	add	$acc1,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$j,4,$j
	cmp	$j,$num
	be,pn	%icc,.Lsqr_no_inner2
	add	$tp,4,$tp

.Lsqr_inner2:
	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$tpj,$sbit,$sbit
	add	$acc0,$car0,$car0
	ld	[$ap+$j],$apj		! ap[j]
	and	$car0,$mask,$acc0
	ld	[$np+$j],$npj		! np[j]
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	ld	[$tp+8],$tpj		! tp[j]
	add	$sbit,$acc0,$acc0
	add	$j,4,$j			! j++
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	cmp	$j,$num
	add	$acc0,$car1,$car1
	add	$acc1,$car1,$car1
	st	$car1,[$tp]		! tp[j-1]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_inner2
	add	$tp,4,$tp		! tp++

.Lsqr_no_inner2:
	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$tpj,$sbit,$sbit
	add	$acc0,$car0,$car0
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	add	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	add	$acc1,$car1,$car1
	st	$car1,[$tp]		! tp[j-1]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0
	add	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	add	$car2,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car2

	add	$i,4,$i			! i++
	ld	[%sp+$bias+$frame],$tmp1	! tp[0]
	ld	[%sp+$bias+$frame+4],$tpj	! tp[1]
	ld	[$ap+$i],$mul0		! ap[j]
	ld	[$np],$car1		! np[0]
	ld	[$np+4],$npj		! np[1]
	mulx	$n0,$tmp1,$mul1
	and	$mul1,$mask,$mul1
	add	$i,4,$tmp0

	mulx	$mul0,$mul0,$car0
	mulx	$car1,$mul1,$car1
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	%sp,$bias+$frame,$tp
	srlx	$car1,32,$car1
	and	$car0,1,$sbit
	srlx	$car0,1,$car0

	cmp	$tmp0,$num		! i<num-1
	bl	%icc,.Lsqr_outer
	mov	4,$j

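! Last pass: only the reduction remains to be propagated before
! branching to the common .Ltail code.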
.Lsqr_last:
	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	add	$j,4,$j
	ld	[$tp+8],$tpj
	cmp	$j,$i
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj
	st	$car1,[$tp]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_last
	add	$tp,4,$tp
!.Lsqr_last

	mulx	$npj,$mul1,$acc1
	add	$tpj,$acc0,$acc0
	srlx	$acc0,32,$tmp0
	and	$acc0,$mask,$acc0
	add	$tmp0,$sbit,$sbit
	add	$acc0,$car1,$car1
	add	$acc1,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0	! recover $car0
	add	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	add	$car2,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car2

	ba	.Ltail
	add	$tp,8,$tp
.type	$fname,#function
.size	$fname,(.-$fname)
.asciz	"Montgomery Multiplication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align	32
___
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;
close STDOUT;