1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. Rights for redistribution and usage in source and binary
6# forms are granted according to the OpenSSL license.
7# ====================================================================
8
9# December 2005
10#
11# Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
12# for undertaken effort are multiple. First of all, UltraSPARC is not
13# the whole SPARCv9 universe and other VIS-free implementations deserve
14# optimized code as much. Secondly, newly introduced UltraSPARC T1,
# a.k.a. Niagara, has shared FPU and concurrent FPU-intensive paths,
16# such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
17# several integrated RSA/DSA accelerator circuits accessible through
18# kernel driver [only(*)], but having decent user-land software
19# implementation is important too. Finally, reasons like desire to
20# experiment with dedicated squaring procedure. Yes, this module
21# implements one, because it was easiest to draft it in SPARCv9
22# instructions...
23
24# (*) Engine accessing the driver in question is on my TODO list.
#	For reference, accelerator is estimated to give 6 to 10 times
26# improvement on single-threaded RSA sign. It should be noted
27# that 6-10x improvement coefficient does not actually mean
28# something extraordinary in terms of absolute [single-threaded]
29# performance, as SPARCv9 instruction set is by all means least
30# suitable for high performance crypto among other 64 bit
31# platforms. 6-10x factor simply places T1 in same performance
#	domain as say AMD64 and IA-64. Improvement of RSA verify doesn't
33# appear impressive at all, but it's the sign operation which is
34# far more critical/interesting.
35
36# You might notice that inner loops are modulo-scheduled:-) This has
37# essentially negligible impact on UltraSPARC performance, it's
38# Fujitsu SPARC64 V users who should notice and hopefully appreciate
39# the advantage... Currently this module surpasses sparcv9a-mont.pl
40# by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a
41# module still have hidden potential [see TODO list there], which is
42# estimated to be larger than 20%...
43
# int bn_mul_mont(
#	BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
#	const BN_ULONG *np, const BN_ULONG *n0, int num);
#
# Incoming arguments as seen after the register-window 'save':
($rp,$ap,$bp,$np,$n0,$num)=("%i0","%i1","%i2","%i3","%i4","%i5");

# Pick ABI flavour from compiler flags on the command line: 64-bit
# builds use the 2047-byte stack bias and larger minimal frame that
# the SPARC V9 ABI mandates.
$bits=(grep { /\-m64/ || /\-xarch\=v9/ } @ARGV) ? 64 : 32;
($bias,$frame)=($bits==64) ? (2047,192) : (0,128);

# Scratch registers used by the inner loops.
($car0,$car1)=("%o0","%o1");		# column carries
$car2="%o2";				# 1 bit
($acc0,$acc1)=("%o3","%o4");		# partial products
$mask="%g1";				# 32 bits, what a waste...
($tmp0,$tmp1)=("%g4","%g5");

($i,$j)=("%l0","%l1");			# outer/inner loop counters (byte offsets)
($mul0,$mul1)=("%l2","%l3");		# current bp[i] and n0*t[0] multipliers
$tp="%l4";				# pointer into temporary result t[]
($apj,$npj,$tpj)=("%l5","%l6","%l7");	# prefetched ap[j], np[j], tp[j]

$fname="bn_mul_mont_int";
76
# Generated assembly, part 1: the bn_mul_mont entry point and the general
# (ap != bp) Montgomery multiplication path.  Layout of the template:
#   - entry guard: refuse num < 4 (less than 128 bits), return 0;
#   - .Lenter: allocate num*4 bytes of temporary t[] on the stack
#     (1KB-aligned, $bias-adjusted for 64-bit ABI), dispatch to
#     .Lbn_sqr_mont when ap == bp;
#   - .L1st: j-loop computing t[] = ap[]*bp[0] + n[]*m (m = t[0]*n0 mod 2^32),
#     modulo-scheduled: .L1st has !prologue!/!epilogue! halves around it;
#   - .Louter/.Linner: i-loop accumulating ap[]*bp[i] into t[] the same way;
#   - .Ltail/.Lsub/.Lcopy/.Lzap: conditional final subtraction of the
#     modulus, copy-out to rp[], and zeroing of the temporary area.
# NOTE(review): the `...` construct on the be,pt line is evaluated by the
# s///gem pass at the bottom of the file and picks %icc vs %xcc by $bits.
$code=<<___;
.section	".text",#alloc,#execinstr

.global	$fname
.align	32
$fname:
	cmp	%o5,4			! 128 bits minimum
	bge,pt	%icc,.Lenter
	sethi	%hi(0xffffffff),$mask
	retl
	clr	%o0
.align	32
.Lenter:
	save	%sp,-$frame,%sp
	sll	$num,2,$num		! num*=4
	or	$mask,%lo(0xffffffff),$mask
	ld	[$n0],$n0
	cmp	$ap,$bp
	and	$num,$mask,$num
	ld	[$bp],$mul0		! bp[0]
	be,pt	`$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont
	nop

	add	%sp,$bias,%o7		! real top of stack
	ld	[$ap],$car0		! ap[0]
	sub	%o7,$num,%o7
	ld	[$ap+4],$apj		! ap[1]
	and	%o7,-1024,%o7
	ld	[$np],$car1		! np[0]
	sub	%o7,$bias,%sp		! alloca
	ld	[$np+4],$npj		! np[1]
	mov	12,$j

	mulx	$car0,$mul0,$car0	! ap[0]*bp[0]
	mulx	$apj,$mul0,$tmp0	!prologue! ap[1]*bp[0]
	and	$car0,$mask,$acc0
	add	%sp,$bias+$frame,$tp
	ld	[$ap+8],$apj		!prologue!

	mulx	$n0,$acc0,$mul1		! "t[0]"*n0
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1	! np[0]*"t[0]"*n0
	mulx	$npj,$mul1,$acc1	!prologue! np[1]*"t[0]"*n0
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	ld	[$np+8],$npj	!prologue!
	srlx	$car1,32,$car1
	mov	$tmp0,$acc0	!prologue!

.L1st:
	mulx	$apj,$mul0,$tmp0
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0
	ld	[$ap+$j],$apj		! ap[j]
	and	$car0,$mask,$acc0
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj		! np[j]
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	add	$j,4,$j			! j++
	mov	$tmp0,$acc0
	st	$car1,[$tp]
	cmp	$j,$num
	mov	$tmp1,$acc1
	srlx	$car1,32,$car1
	bl	%icc,.L1st
	add	$tp,4,$tp		! tp++
!.L1st

	mulx	$apj,$mul0,$tmp0	!epilogue!
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0
	and	$car0,$mask,$acc0
	add	$acc1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$tmp0,$car0,$car0
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car1

	add	$car0,$car1,$car1
	st	$car1,[$tp+8]
	srlx	$car1,32,$car2

	mov	4,$i			! i++
	ld	[$bp+4],$mul0		! bp[1]
.Louter:
	add	%sp,$bias+$frame,$tp
	ld	[$ap],$car0		! ap[0]
	ld	[$ap+4],$apj		! ap[1]
	ld	[$np],$car1		! np[0]
	ld	[$np+4],$npj		! np[1]
	ld	[$tp],$tmp1		! tp[0]
	ld	[$tp+4],$tpj		! tp[1]
	mov	12,$j

	mulx	$car0,$mul0,$car0
	mulx	$apj,$mul0,$tmp0	!prologue!
	add	$tmp1,$car0,$car0
	ld	[$ap+8],$apj	!prologue!
	and	$car0,$mask,$acc0

	mulx	$n0,$acc0,$mul1
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1
	mulx	$npj,$mul1,$acc1	!prologue!
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	ld	[$np+8],$npj	!prologue!
	srlx	$car1,32,$car1
	mov	$tmp0,$acc0	!prologue!

.Linner:
	mulx	$apj,$mul0,$tmp0
	mulx	$npj,$mul1,$tmp1
	add	$tpj,$car0,$car0
	ld	[$ap+$j],$apj		! ap[j]
	add	$acc0,$car0,$car0
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj		! np[j]
	and	$car0,$mask,$acc0
	ld	[$tp+8],$tpj		! tp[j]
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	add	$j,4,$j			! j++
	mov	$tmp0,$acc0
	st	$car1,[$tp]		! tp[j-1]
	srlx	$car1,32,$car1
	mov	$tmp1,$acc1
	cmp	$j,$num
	bl	%icc,.Linner
	add	$tp,4,$tp		! tp++
!.Linner

	mulx	$apj,$mul0,$tmp0	!epilogue!
	mulx	$npj,$mul1,$tmp1
	add	$tpj,$car0,$car0
	add	$acc0,$car0,$car0
	ld	[$tp+8],$tpj		! tp[j]
	and	$car0,$mask,$acc0
	add	$acc1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]		! tp[j-1]
	srlx	$car1,32,$car1

	add	$tpj,$car0,$car0
	add	$tmp0,$car0,$car0
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	add	$acc0,$car1,$car1
	st	$car1,[$tp+4]		! tp[j-1]
	srlx	$car0,32,$car0
	add	$i,4,$i			! i++
	srlx	$car1,32,$car1

	add	$car0,$car1,$car1
	cmp	$i,$num
	add	$car2,$car1,$car1
	st	$car1,[$tp+8]

	srlx	$car1,32,$car2
	bl,a	%icc,.Louter
	ld	[$bp+$i],$mul0		! bp[i]
!.Louter

	add	$tp,12,$tp

.Ltail:
	add	$np,$num,$np
	add	$rp,$num,$rp

	cmp	$car2,0			! clears %icc.c
	bne,pn	%icc,.Lsub
	sub	%g0,$num,%o7		! k=-num

	cmp	$car1,$npj		! compare top-most $tp and $np words
	bcs,pt	%icc,.Lcopy		! %icc.c is clean if not taken
	nop

.align	16,0x1000000
.Lsub:
	ld	[$tp+%o7],%o0
	ld	[$np+%o7],%o1
	subccc	%o0,%o1,%o1
	st	%o1,[$rp+%o7]
	add	%o7,4,%o7
	brnz	%o7,.Lsub
	nop
	subccc	$car2,0,$car2
	bcc	%icc,.Lzap
	sub	%g0,$num,%o7

.align	16,0x1000000
.Lcopy:
	ld	[$tp+%o7],%o0
	st	%o0,[$rp+%o7]
	add	%o7,4,%o7
	brnz	%o7,.Lcopy
	nop
	ba	.Lzap
	sub	%g0,$num,%o7

.align	32
.Lzap:
	st	%g0,[$tp+%o7]
	add	%o7,4,%o7
	brnz	%o7,.Lzap
	nop
	mov	1,%i0
	ret
	restore
___
299\f
########
######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
######## code without following dedicated squaring procedure.
########
# $sbit carries the bit shifted out when a doubled cross-product is
# split back into 32-bit halves; $bp is free here since ap == bp.
$sbit="%i2";			# re-use $bp!

# Generated assembly, part 2: dedicated squaring path, entered from the
# ap == bp dispatch in part 1.  Structure of the template:
#   - .Lbn_sqr_mont: allocate t[] (same discipline as the general path)
#     and compute the i == 0 column: ap[]*ap[0] with the off-diagonal
#     products doubled via the add/or-$sbit sequences (.Lsqr_1st);
#   - the i == 1 column (.Lsqr_2nd) is likewise fully unrolled;
#   - .Lsqr_outer: for i >= 2, .Lsqr_inner1 only adds the n[]*m reduction
#     for j < i (cross-products already accounted for in earlier columns),
#     then .Lsqr_inner2 handles j > i with doubled ap[j]*ap[i] terms;
#   - .Lsqr_last: final reduction-only pass, then it falls back into
#     .Ltail of part 1 for the conditional final subtraction.
$code.=<<___;
.align	32
.Lbn_sqr_mont:
	add	%sp,$bias,%o7		! real top of stack
	ld	[$ap+4],$apj		! ap[1]
	sub	%o7,$num,%o7
	ld	[$np],$car1		! np[0]
	and	%o7,-1024,%o7
	ld	[$np+4],$npj		! np[1]
	sub	%o7,$bias,%sp		! alloca
	mov	12,$j

	mulx	$mul0,$mul0,$car0	! ap[0]*ap[0]
	mulx	$apj,$mul0,$tmp0	!prologue!
	and	$car0,$mask,$acc0
	add	%sp,$bias+$frame,$tp
	ld	[$ap+8],$apj		!prologue!

	mulx	$n0,$acc0,$mul1		! "t[0]"*n0
	srlx	$car0,32,$car0
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1	! np[0]*"t[0]"*n0
	mulx	$npj,$mul1,$acc1	!prologue!
	and	$car0,1,$sbit
	ld	[$np+8],$npj		!prologue!
	srlx	$car0,1,$car0
	add	$acc0,$car1,$car1
	srlx	$car1,32,$car1
	mov	$tmp0,$acc0		!prologue!

.Lsqr_1st:
	mulx	$apj,$mul0,$tmp0
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0	! ap[j]*a0+c0
	add	$acc1,$car1,$car1
	ld	[$ap+$j],$apj		! ap[j]
	and	$car0,$mask,$acc0
	ld	[$np+$j],$npj		! np[j]
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	or	$sbit,$acc0,$acc0
	mov	$tmp1,$acc1
	srlx	$acc0,32,$sbit
	add	$j,4,$j			! j++
	and	$acc0,$mask,$acc0
	cmp	$j,$num
	add	$acc0,$car1,$car1
	st	$car1,[$tp]
	mov	$tmp0,$acc0
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_1st
	add	$tp,4,$tp		! tp++
!.Lsqr_1st

	mulx	$apj,$mul0,$tmp0	! epilogue
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0	! ap[j]*a0+c0
	add	$acc1,$car1,$car1
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	or	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$tmp0,$car0,$car0	! ap[j]*a0+c0
	add	$tmp1,$car1,$car1
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	or	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0
	or	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	st	$car1,[$tp+8]
	srlx	$car1,32,$car2

	ld	[%sp+$bias+$frame],$tmp0	! tp[0]
	ld	[%sp+$bias+$frame+4],$tmp1	! tp[1]
	ld	[%sp+$bias+$frame+8],$tpj	! tp[2]
	ld	[$ap+4],$mul0		! ap[1]
	ld	[$ap+8],$apj		! ap[2]
	ld	[$np],$car1		! np[0]
	ld	[$np+4],$npj		! np[1]
	mulx	$n0,$tmp0,$mul1

	mulx	$mul0,$mul0,$car0
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1
	mulx	$npj,$mul1,$acc1
	add	$tmp0,$car1,$car1
	and	$car0,$mask,$acc0
	ld	[$np+8],$npj		! np[2]
	srlx	$car1,32,$car1
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	and	$car0,1,$sbit
	add	$acc1,$car1,$car1
	srlx	$car0,1,$car0
	mov	12,$j
	st	$car1,[%sp+$bias+$frame]	! tp[0]=
	srlx	$car1,32,$car1
	add	%sp,$bias+$frame+4,$tp

.Lsqr_2nd:
	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$acc0,$car0,$car0
	add	$tpj,$car1,$car1
	ld	[$ap+$j],$apj		! ap[j]
	and	$car0,$mask,$acc0
	ld	[$np+$j],$npj		! np[j]
	srlx	$car0,32,$car0
	add	$acc1,$car1,$car1
	ld	[$tp+8],$tpj		! tp[j]
	add	$acc0,$acc0,$acc0
	add	$j,4,$j			! j++
	or	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	cmp	$j,$num
	add	$acc0,$car1,$car1
	st	$car1,[$tp]		! tp[j-1]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_2nd
	add	$tp,4,$tp		! tp++
!.Lsqr_2nd

	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$acc0,$car0,$car0
	add	$tpj,$car1,$car1
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc1,$car1,$car1
	add	$acc0,$acc0,$acc0
	or	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]		! tp[j-1]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0
	or	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	add	$car2,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car2

	ld	[%sp+$bias+$frame],$tmp1	! tp[0]
	ld	[%sp+$bias+$frame+4],$tpj	! tp[1]
	ld	[$ap+8],$mul0		! ap[2]
	ld	[$np],$car1		! np[0]
	ld	[$np+4],$npj		! np[1]
	mulx	$n0,$tmp1,$mul1
	and	$mul1,$mask,$mul1
	mov	8,$i

	mulx	$mul0,$mul0,$car0
	mulx	$car1,$mul1,$car1
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	%sp,$bias+$frame,$tp
	srlx	$car1,32,$car1
	and	$car0,1,$sbit
	srlx	$car0,1,$car0
	mov	4,$j

.Lsqr_outer:
.Lsqr_inner1:
	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	add	$j,4,$j
	ld	[$tp+8],$tpj
	cmp	$j,$i
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj
	st	$car1,[$tp]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_inner1
	add	$tp,4,$tp
!.Lsqr_inner1

	add	$j,4,$j
	ld	[$ap+$j],$apj		! ap[j]
	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	ld	[$np+$j],$npj		! np[j]
	add	$acc0,$car1,$car1
	ld	[$tp+8],$tpj		! tp[j]
	add	$acc1,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$j,4,$j
	cmp	$j,$num
	be,pn	%icc,.Lsqr_no_inner2
	add	$tp,4,$tp

.Lsqr_inner2:
	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	add	$acc0,$car0,$car0
	ld	[$ap+$j],$apj		! ap[j]
	and	$car0,$mask,$acc0
	ld	[$np+$j],$npj		! np[j]
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	ld	[$tp+8],$tpj		! tp[j]
	or	$sbit,$acc0,$acc0
	add	$j,4,$j			! j++
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	cmp	$j,$num
	add	$acc0,$car1,$car1
	add	$acc1,$car1,$car1
	st	$car1,[$tp]		! tp[j-1]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_inner2
	add	$tp,4,$tp		! tp++

.Lsqr_no_inner2:
	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	add	$acc0,$car0,$car0
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	or	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	add	$acc1,$car1,$car1
	st	$car1,[$tp]		! tp[j-1]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0
	or	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	add	$car2,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car2

	add	$i,4,$i			! i++
	ld	[%sp+$bias+$frame],$tmp1	! tp[0]
	ld	[%sp+$bias+$frame+4],$tpj	! tp[1]
	ld	[$ap+$i],$mul0		! ap[j]
	ld	[$np],$car1		! np[0]
	ld	[$np+4],$npj		! np[1]
	mulx	$n0,$tmp1,$mul1
	and	$mul1,$mask,$mul1
	add	$i,4,$tmp0

	mulx	$mul0,$mul0,$car0
	mulx	$car1,$mul1,$car1
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	%sp,$bias+$frame,$tp
	srlx	$car1,32,$car1
	and	$car0,1,$sbit
	srlx	$car0,1,$car0

	cmp	$tmp0,$num		! i<num-1
	bl	%icc,.Lsqr_outer
	mov	4,$j

.Lsqr_last:
	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	add	$j,4,$j
	ld	[$tp+8],$tpj
	cmp	$j,$i
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj
	st	$car1,[$tp]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_last
	add	$tp,4,$tp
!.Lsqr_last

	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	add	$acc0,$car1,$car1
	add	$acc1,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0	! recover $car0
	or	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	add	$car2,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car2

	ba	.Ltail
	add	$tp,8,$tp
.type	$fname,#function
.size	$fname,(.-$fname)
___
# Post-process the template: every `...` construct in $code is evaluated
# as Perl and replaced by its result (used above to select %icc vs %xcc
# depending on $bits), then the finished assembler text goes to stdout.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;
close STDOUT;