1 #!/usr/bin/env perl
2
3 #******************************************************************************#
4 #* Copyright(c) 2012, Intel Corp. *#
5 #* Developers and authors: *#
6 #* Shay Gueron (1, 2), and Vlad Krasnov (1) *#
7 #* (1) Intel Architecture Group, Microprocessor and Chipset Development, *#
8 #* Israel Development Center, Haifa, Israel *#
9 #* (2) University of Haifa *#
10 #******************************************************************************#
11 #* This submission to OpenSSL is to be made available under the OpenSSL *#
12 #* license, and only to the OpenSSL project, in order to allow integration *#
13 #* into the publicly distributed code. *#
14 #* The use of this code, or portions of this code, or concepts embedded in *#
15 #* this code, or modification of this code and/or algorithm(s) in it, or the *#
16 #* use of this code for any other purpose than stated above, requires special *#
17 #* licensing. *#
18 #******************************************************************************#
19 #******************************************************************************#
20 #* DISCLAIMER: *#
21 #* THIS SOFTWARE IS PROVIDED BY THE CONTRIBUTORS AND THE COPYRIGHT OWNERS *#
22 #* ``AS IS''. ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED *#
23 #* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR *#
24 #* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS OR THE COPYRIGHT*#
25 #* OWNERS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, *#
26 #* OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF *#
27 #* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS *#
28 #* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN *#
29 #* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) *#
30 #* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE *#
31 #* POSSIBILITY OF SUCH DAMAGE. *#
32 #******************************************************************************#
33 #* Reference: *#
34 #* [1] S. Gueron, "Efficient Software Implementations of Modular *#
35 #* Exponentiation", http://eprint.iacr.org/2011/239 *#
36 #* [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring". *#
37 #* IEEE Proceedings of 9th International Conference on Information *#
38 #* Technology: New Generations (ITNG 2012), 821-823 (2012). *#
39 #* [3] S. Gueron, Efficient Software Implementations of Modular Exponentiation*#
40 #* Journal of Cryptographic Engineering 2:31-43 (2012). *#
41 #* [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis *#
42 #* resistant 512-bit and 1024-bit modular exponentiation for optimizing *#
43 #* RSA1024 and RSA2048 on x86_64 platforms", *#
44 #* http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest*#
45 ################################################################################
46
47 # While the original submission covers 512- and 1024-bit exponentiation,
48 # this module is limited to the 512-bit version only (and as such
49 # accelerates RSA1024 sign). This is because the improvement for longer
50 # keys is not high enough to justify the effort: the highest measured
51 # gain was ~5% on Westmere [relative to OpenSSL 1.0.2, upcoming at
52 # the moment of this writing]. Nor does this module implement a
53 # "monolithic" all-in-one exponentiation subroutine; it adheres to a
54 # more modular mixture of C and assembly, and it is optimized even
55 # for processors other than the Intel Core family (see the table below
56 # for improvement coefficients and the usage sketch that follows it).
57 # <appro@openssl.org>
58 #
59 # RSA1024 sign/sec this/original |this/rsax(*) this/fips(*)
60 # ----------------+---------------------------
61 # Opteron +13% |+5% +20%
62 # Bulldozer -0% |-1% +10%
63 # P4 +11% |+7% +8%
64 # Westmere +5% |+14% +17%
65 # Sandy Bridge +2% |+12% +29%
66 # Ivy Bridge +1% |+11% +35%
67 # Haswell(**) -0% |+12% +39%
68 # Atom +13% |+11% +4%
69 # VIA Nano +70% |+9% +25%
70 #
71 # (*) rsax engine and fips numbers are presented for reference
72 # purposes only;
73 # (**) you might notice the MULX code below; strangely enough its gain
74 # is marginal, which is why that code remains disabled;
75
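#
# For reference, an illustrative sketch of the C-level interface, inferred
# from the register assignments used below (the authoritative prototypes
# live in the accompanying C glue and may differ; limb pointers are assumed
# to address eight 64-bit words):
#
#	void rsaz_512_sqr         (uint64_t out[8], const uint64_t inp[8],
#	                           const uint64_t mod[8], uint64_t n0, int times);
#	void rsaz_512_mul         (uint64_t out[8], const uint64_t ap[8],
#	                           const uint64_t bp[8], const uint64_t mod[8],
#	                           uint64_t n0);
#	void rsaz_512_mul_gather4 (uint64_t out[8], const uint64_t ap[8],
#	                           const uint64_t *table, const uint64_t mod[8],
#	                           uint64_t n0, int power);
#	void rsaz_512_mul_scatter4(uint64_t out[8], const uint64_t ap[8],
#	                           const uint64_t mod[8], uint64_t n0,
#	                           uint64_t *table, int power);
#	void rsaz_512_mul_by_one  (uint64_t out[8], const uint64_t inp[8],
#	                           const uint64_t mod[8], uint64_t n0);
#	void rsaz_512_scatter4    (uint64_t *table, const uint64_t val[8], int power);
#	void rsaz_512_gather4     (uint64_t val[8], const uint64_t *table, int power);
#
# A typical driver keeps the operands in the Montgomery domain, fills the
# window table with the scatter helpers, then for every exponent window
# performs rsaz_512_sqr with "times" equal to the window width followed by
# rsaz_512_mul_gather4, and finally converts the result back with
# rsaz_512_mul_by_one.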
76 $flavour = shift;
77 $output = shift;
78 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
79
80 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
81
82 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
83 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
84 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
85 die "can't locate x86_64-xlate.pl";
86
87 open OUT,"| $^X $xlate $flavour $output";
88 *STDOUT=*OUT;
89
90 ($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp"); # common internal API
91 {
92 my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d");
93
94 $code.=<<___;
95 .text
96
97 .globl rsaz_512_sqr
98 .type rsaz_512_sqr,\@function,4
99 .align 32
100 rsaz_512_sqr: # 25-29% faster than rsaz_512_mul
101 push %rbx
102 push %rbp
103 push %r12
104 push %r13
105 push %r14
106 push %r15
107
108 subq \$128+24, %rsp
109 .Lsqr_body:
110 movq $mod, %rbp # common argument
111 movq ($inp), %rdx
112 movq 8($inp), %rax
113 movq $n0, 128(%rsp)
114 jmp .Loop_sqr
115
116 .align 32
117 .Loop_sqr:
118 movl $times,128+8(%rsp)
119 ___
120 if (1) {
121 $code.=<<___;
122 #first iteration
123 movq %rdx, %rbx
124 mulq %rdx
125 movq %rax, %r8
126 movq 16($inp), %rax
127 movq %rdx, %r9
128
129 mulq %rbx
130 addq %rax, %r9
131 movq 24($inp), %rax
132 movq %rdx, %r10
133 adcq \$0, %r10
134
135 mulq %rbx
136 addq %rax, %r10
137 movq 32($inp), %rax
138 movq %rdx, %r11
139 adcq \$0, %r11
140
141 mulq %rbx
142 addq %rax, %r11
143 movq 40($inp), %rax
144 movq %rdx, %r12
145 adcq \$0, %r12
146
147 mulq %rbx
148 addq %rax, %r12
149 movq 48($inp), %rax
150 movq %rdx, %r13
151 adcq \$0, %r13
152
153 mulq %rbx
154 addq %rax, %r13
155 movq 56($inp), %rax
156 movq %rdx, %r14
157 adcq \$0, %r14
158
159 mulq %rbx
160 addq %rax, %r14
161 movq %rbx, %rax
162 movq %rdx, %r15
163 adcq \$0, %r15
164
165 addq %r8, %r8 #shlq \$1, %r8
166 movq %r9, %rcx
167 adcq %r9, %r9 #shld \$1, %r8, %r9
168
169 mulq %rax
170 movq %rax, (%rsp)
171 addq %rdx, %r8
172 adcq \$0, %r9
173
174 movq %r8, 8(%rsp)
175 shrq \$63, %rcx
176
177 #second iteration
178 movq 8($inp), %r8
179 movq 16($inp), %rax
180 mulq %r8
181 addq %rax, %r10
182 movq 24($inp), %rax
183 movq %rdx, %rbx
184 adcq \$0, %rbx
185
186 mulq %r8
187 addq %rax, %r11
188 movq 32($inp), %rax
189 adcq \$0, %rdx
190 addq %rbx, %r11
191 movq %rdx, %rbx
192 adcq \$0, %rbx
193
194 mulq %r8
195 addq %rax, %r12
196 movq 40($inp), %rax
197 adcq \$0, %rdx
198 addq %rbx, %r12
199 movq %rdx, %rbx
200 adcq \$0, %rbx
201
202 mulq %r8
203 addq %rax, %r13
204 movq 48($inp), %rax
205 adcq \$0, %rdx
206 addq %rbx, %r13
207 movq %rdx, %rbx
208 adcq \$0, %rbx
209
210 mulq %r8
211 addq %rax, %r14
212 movq 56($inp), %rax
213 adcq \$0, %rdx
214 addq %rbx, %r14
215 movq %rdx, %rbx
216 adcq \$0, %rbx
217
218 mulq %r8
219 addq %rax, %r15
220 movq %r8, %rax
221 adcq \$0, %rdx
222 addq %rbx, %r15
223 movq %rdx, %r8
224 movq %r10, %rdx
225 adcq \$0, %r8
226
227 add %rdx, %rdx
228 lea (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
229 movq %r11, %rbx
230 adcq %r11, %r11 #shld \$1, %r10, %r11
231
232 mulq %rax
233 addq %rax, %r9
234 adcq %rdx, %r10
235 adcq \$0, %r11
236
237 movq %r9, 16(%rsp)
238 movq %r10, 24(%rsp)
239 shrq \$63, %rbx
240
241 #third iteration
242 movq 16($inp), %r9
243 movq 24($inp), %rax
244 mulq %r9
245 addq %rax, %r12
246 movq 32($inp), %rax
247 movq %rdx, %rcx
248 adcq \$0, %rcx
249
250 mulq %r9
251 addq %rax, %r13
252 movq 40($inp), %rax
253 adcq \$0, %rdx
254 addq %rcx, %r13
255 movq %rdx, %rcx
256 adcq \$0, %rcx
257
258 mulq %r9
259 addq %rax, %r14
260 movq 48($inp), %rax
261 adcq \$0, %rdx
262 addq %rcx, %r14
263 movq %rdx, %rcx
264 adcq \$0, %rcx
265
266 mulq %r9
267 movq %r12, %r10
268 lea (%rbx,%r12,2), %r12 #shld \$1, %rbx, %r12
269 addq %rax, %r15
270 movq 56($inp), %rax
271 adcq \$0, %rdx
272 addq %rcx, %r15
273 movq %rdx, %rcx
274 adcq \$0, %rcx
275
276 mulq %r9
277 shrq \$63, %r10
278 addq %rax, %r8
279 movq %r9, %rax
280 adcq \$0, %rdx
281 addq %rcx, %r8
282 movq %rdx, %r9
283 adcq \$0, %r9
284
285 movq %r13, %rcx
286 leaq (%r10,%r13,2), %r13 #shld \$1, %r12, %r13
287
288 mulq %rax
289 addq %rax, %r11
290 adcq %rdx, %r12
291 adcq \$0, %r13
292
293 movq %r11, 32(%rsp)
294 movq %r12, 40(%rsp)
295 shrq \$63, %rcx
296
297 #fourth iteration
298 movq 24($inp), %r10
299 movq 32($inp), %rax
300 mulq %r10
301 addq %rax, %r14
302 movq 40($inp), %rax
303 movq %rdx, %rbx
304 adcq \$0, %rbx
305
306 mulq %r10
307 addq %rax, %r15
308 movq 48($inp), %rax
309 adcq \$0, %rdx
310 addq %rbx, %r15
311 movq %rdx, %rbx
312 adcq \$0, %rbx
313
314 mulq %r10
315 movq %r14, %r12
316 leaq (%rcx,%r14,2), %r14 #shld \$1, %rcx, %r14
317 addq %rax, %r8
318 movq 56($inp), %rax
319 adcq \$0, %rdx
320 addq %rbx, %r8
321 movq %rdx, %rbx
322 adcq \$0, %rbx
323
324 mulq %r10
325 shrq \$63, %r12
326 addq %rax, %r9
327 movq %r10, %rax
328 adcq \$0, %rdx
329 addq %rbx, %r9
330 movq %rdx, %r10
331 adcq \$0, %r10
332
333 movq %r15, %rbx
334 leaq (%r12,%r15,2),%r15 #shld \$1, %r14, %r15
335
336 mulq %rax
337 addq %rax, %r13
338 adcq %rdx, %r14
339 adcq \$0, %r15
340
341 movq %r13, 48(%rsp)
342 movq %r14, 56(%rsp)
343 shrq \$63, %rbx
344
345 #fifth iteration
346 movq 32($inp), %r11
347 movq 40($inp), %rax
348 mulq %r11
349 addq %rax, %r8
350 movq 48($inp), %rax
351 movq %rdx, %rcx
352 adcq \$0, %rcx
353
354 mulq %r11
355 addq %rax, %r9
356 movq 56($inp), %rax
357 adcq \$0, %rdx
358 movq %r8, %r12
359 leaq (%rbx,%r8,2), %r8 #shld \$1, %rbx, %r8
360 addq %rcx, %r9
361 movq %rdx, %rcx
362 adcq \$0, %rcx
363
364 mulq %r11
365 shrq \$63, %r12
366 addq %rax, %r10
367 movq %r11, %rax
368 adcq \$0, %rdx
369 addq %rcx, %r10
370 movq %rdx, %r11
371 adcq \$0, %r11
372
373 movq %r9, %rcx
374 leaq (%r12,%r9,2), %r9 #shld \$1, %r8, %r9
375
376 mulq %rax
377 addq %rax, %r15
378 adcq %rdx, %r8
379 adcq \$0, %r9
380
381 movq %r15, 64(%rsp)
382 movq %r8, 72(%rsp)
383 shrq \$63, %rcx
384
385 #sixth iteration
386 movq 40($inp), %r12
387 movq 48($inp), %rax
388 mulq %r12
389 addq %rax, %r10
390 movq 56($inp), %rax
391 movq %rdx, %rbx
392 adcq \$0, %rbx
393
394 mulq %r12
395 addq %rax, %r11
396 movq %r12, %rax
397 movq %r10, %r15
398 leaq (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
399 adcq \$0, %rdx
400 shrq \$63, %r15
401 addq %rbx, %r11
402 movq %rdx, %r12
403 adcq \$0, %r12
404
405 movq %r11, %rbx
406 leaq (%r15,%r11,2), %r11 #shld \$1, %r10, %r11
407
408 mulq %rax
409 addq %rax, %r9
410 adcq %rdx, %r10
411 adcq \$0, %r11
412
413 movq %r9, 80(%rsp)
414 movq %r10, 88(%rsp)
415
416 #seventh iteration
417 movq 48($inp), %r13
418 movq 56($inp), %rax
419 mulq %r13
420 addq %rax, %r12
421 movq %r13, %rax
422 movq %rdx, %r13
423 adcq \$0, %r13
424
425 xorq %r14, %r14
426 shlq \$1, %rbx
427 adcq %r12, %r12 #shld \$1, %rbx, %r12
428 adcq %r13, %r13 #shld \$1, %r12, %r13
429 adcq %r14, %r14 #shld \$1, %r13, %r14
430
431 mulq %rax
432 addq %rax, %r11
433 adcq %rdx, %r12
434 adcq \$0, %r13
435
436 movq %r11, 96(%rsp)
437 movq %r12, 104(%rsp)
438
439 #eighth iteration
440 movq 56($inp), %rax
441 mulq %rax
442 addq %rax, %r13
443 adcq \$0, %rdx
444
445 addq %rdx, %r14
446
447 movq %r13, 112(%rsp)
448 movq %r14, 120(%rsp)
449 ___
450 } else {
451 $code.=<<___;
452 movq $out, %xmm0 # off-load
453 #first iteration
454 mulx %rax, %r8, %r9
455
456 mulx 16($inp), %rcx, %r10
457
458 mulx 24($inp), %rax, %r11
459 add %rcx, %r9
460
461 mulx 32($inp), %rcx, %r12
462 adc %rax, %r10
463
464 mulx 40($inp), %rax, %r13
465 adc %rcx, %r11
466
467 mulx 48($inp), %rcx, %r14
468 adc %rax, %r12
469
470 mulx 56($inp), %rax, %r15
471 adc %rcx, %r13
472 mov %r9, %rcx
473 adc %rax, %r14
474 adc \$0, %r15
475
476 shld \$1, %r8, %r9
477 shl \$1, %r8
478
479 mulx %rdx, %rax, %rdx
480 add %rdx, %r8
481 adc \$0, %r9
482
483 mov %rax, (%rsp)
484 mov %r8, 8(%rsp)
485
486 #second iteration
487 mov 8($inp), %rdx
488 mulx 16($inp), %rax, %rbx
489
490 mulx 24($inp), $out, %r8
491 add %rax, %r10
492 adc %rbx, %r11
493 adc \$0, %r8
494
495 mulx 32($inp), %rax, %rbx
496 add $out, %r11
497 adc %r8, %r12
498 adc \$0, %rbx
499
500 mulx 40($inp), $out, %r8
501 add %rax, %r12
502 adc %rbx, %r13
503 adc \$0, %r8
504
505 mulx 48($inp), %rax, %rbx
506 add $out, %r13
507 adc %r8, %r14
508 adc \$0, %rbx
509
510 mulx 56($inp), $out, %r8
511 add %rax, %r14
512 adc %rbx, %r15
513 mov %r11, %rbx
514 adc \$0, %r8
515 add $out, %r15
516 adc \$0, %r8
517
518 shld \$1, %r10, %r11
519 shld \$1, %rcx, %r10
520
521 mulx %rdx, %rax, %rcx
522 add %rax, %r9
523 adc %rcx, %r10
524 adc \$0, %r11
525
526 mov %r9, 16(%rsp)
527 mov %r10, 24(%rsp)
528
529 #third iteration
530 mov 16($inp), %rdx
531 mulx 24($inp), $out, %r9
532
533 mulx 32($inp), %rax, %rcx
534 add $out, %r12
535 adc %r9, %r13
536 adc \$0, %rcx
537
538 mulx 40($inp), $out, %r9
539 add %rax, %r13
540 adc %rcx, %r14
541 adc \$0, %r9
542
543 mulx 48($inp), %rax, %rcx
544 add $out, %r14
545 adc %r9, %r15
546 adc \$0, %rcx
547
548 mulx 56($inp), $out, %r9
549 add %rax, %r15
550 adc %rcx, %r8
551 mov %r13, %rcx
552 adc \$0, %r9
553 add $out, %r8
554 adc \$0, %r9
555
556 shld \$1, %r12, %r13
557 shld \$1, %rbx, %r12
558
559 mulx %rdx, %rax, %rdx
560 add %rax, %r11
561 adc %rdx, %r12
562 adc \$0, %r13
563
564 mov %r11, 32(%rsp)
565 mov %r12, 40(%rsp)
566
567 #fourth iteration
568 mov 24($inp), %rdx
569 mulx 32($inp), %rax, %rbx
570
571 mulx 40($inp), $out, %r10
572 add %rax, %r14
573 adc %rbx, %r15
574 adc \$0, %r10
575
576 mulx 48($inp), %rax, %rbx
577 add $out, %r15
578 adc %r10, %r8
579 adc \$0, %rbx
580
581 mulx 56($inp), $out, %r10
582 add %rax, %r8
583 adc \$0, %rbx
584 add $out, %r9
585 adc \$0, %r10
586 add %rbx, %r9
587 mov %r15, %rbx
588 adc \$0, %r10
589
590 shld \$1, %r14, %r15
591 shld \$1, %rcx, %r14
592
593 mulx %rdx, %rax, %rdx
594 add %rax, %r13
595 adc %rdx, %r14
596 adc \$0, %r15
597
598 mov %r13, 48(%rsp)
599 mov %r14, 56(%rsp)
600
601 #fifth iteration
602 mov 32($inp), %rdx
603 mulx 40($inp), $out, %r11
604
605 mulx 48($inp), %rax, %rcx
606 add $out, %r8
607 adc %r11, %r9
608 adc \$0, %rcx
609
610 mulx 56($inp), $out, %r11
611 add %rax, %r9
612 adc %rcx, %r10
613 adc \$0, %r11
614 add $out, %r10
615 adc \$0, %r11
616
617 mov %r9, %rcx
618 shld \$1, %r8, %r9
619 shld \$1, %rbx, %r8
620
621 mulx %rdx, %rax, %rdx
622 add %rax, %r15
623 adc %rdx, %r8
624 adc \$0, %r9
625
626 mov %r15, 64(%rsp)
627 mov %r8, 72(%rsp)
628
629 #sixth iteration
630 mov 40($inp), %rdx
631 mulx 48($inp), %rax, %rbx
632
633 mulx 56($inp), $out, %r12
634 add %rax, %r10
635 adc %rbx, %r11
636 adc \$0, %r12
637 add $out, %r11
638 adc \$0, %r12
639
640 mov %r11, %rbx
641 shld \$1, %r10, %r11
642 shld \$1, %rcx, %r10
643
644 mulx %rdx, %rax, %rdx
645 add %rax, %r9
646 adc %rdx, %r10
647 adc \$0, %r11
648
649 mov %r9, 80(%rsp)
650 mov %r10, 88(%rsp)
651
652 #seventh iteration
653 mov 48($inp), %rdx
654 mulx 56($inp), %rax, %r13
655 add %rax, %r12
656 adc \$0, %r13
657
658 xor %r14, %r14
659 shld \$1, %r13, %r14
660 shld \$1, %r12, %r13
661 shld \$1, %rbx, %r12
662
663 mulx %rdx, %rax, %rdx
664 add %rax, %r11
665 adc %rdx, %r12
666 adc \$0, %r13
667
668 mov %r11, 96(%rsp)
669 mov %r12, 104(%rsp)
670
671 #eighth iteration
672 mov 56($inp), %rdx
673 mulx %rdx, %rax, %rdx
674 add %rax, %r13
675 adc \$0, %rdx
676
677 add %rdx, %r14
678
679 movq %r13, 112(%rsp)
680 movq %r14, 120(%rsp)
681 movq %xmm0, $out
682 ___
683 }
684 $code.=<<___;
685 movq (%rsp), %r8
686 movq 8(%rsp), %r9
687 movq 16(%rsp), %r10
688 movq 24(%rsp), %r11
689 movq 32(%rsp), %r12
690 movq 40(%rsp), %r13
691 movq 48(%rsp), %r14
692 movq 56(%rsp), %r15
693
694 call _rsaz_512_reduce
695
696 addq 64(%rsp), %r8
697 adcq 72(%rsp), %r9
698 adcq 80(%rsp), %r10
699 adcq 88(%rsp), %r11
700 adcq 96(%rsp), %r12
701 adcq 104(%rsp), %r13
702 adcq 112(%rsp), %r14
703 adcq 120(%rsp), %r15
704 sbbq %rcx, %rcx
705
706 call _rsaz_512_subtract
707
708 movq %r8, %rdx
709 movq %r9, %rax
710 movl 128+8(%rsp), $times
711 movq $out, $inp
712
713 decl $times
714 jnz .Loop_sqr
715
716 leaq 128+24+48(%rsp), %rax
717 movq -48(%rax), %r15
718 movq -40(%rax), %r14
719 movq -32(%rax), %r13
720 movq -24(%rax), %r12
721 movq -16(%rax), %rbp
722 movq -8(%rax), %rbx
723 leaq (%rax), %rsp
724 .Lsqr_epilogue:
725 ret
726 .size rsaz_512_sqr,.-rsaz_512_sqr
727 ___
728 }
729 {
730 my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
731 $code.=<<___;
732 .globl rsaz_512_mul
733 .type rsaz_512_mul,\@function,5
734 .align 32
735 rsaz_512_mul:
736 push %rbx
737 push %rbp
738 push %r12
739 push %r13
740 push %r14
741 push %r15
742
743 subq \$128+24, %rsp
744 .Lmul_body:
745 movq $out, %xmm0 # off-load arguments
746 movq $mod, %xmm1
747 movq $n0, 128(%rsp)
748
749 movq $bp, %rbp # pass argument
750 call __rsaz_512_mul
751
752 movq %xmm0, $out
753 movq %xmm1, %rbp
754
755 movq (%rsp), %r8
756 movq 8(%rsp), %r9
757 movq 16(%rsp), %r10
758 movq 24(%rsp), %r11
759 movq 32(%rsp), %r12
760 movq 40(%rsp), %r13
761 movq 48(%rsp), %r14
762 movq 56(%rsp), %r15
763
764 call _rsaz_512_reduce
765
766 addq 64(%rsp), %r8
767 adcq 72(%rsp), %r9
768 adcq 80(%rsp), %r10
769 adcq 88(%rsp), %r11
770 adcq 96(%rsp), %r12
771 adcq 104(%rsp), %r13
772 adcq 112(%rsp), %r14
773 adcq 120(%rsp), %r15
774 sbbq %rcx, %rcx
775
776 call _rsaz_512_subtract
777
778 leaq 128+24+48(%rsp), %rax
779 movq -48(%rax), %r15
780 movq -40(%rax), %r14
781 movq -32(%rax), %r13
782 movq -24(%rax), %r12
783 movq -16(%rax), %rbp
784 movq -8(%rax), %rbx
785 leaq (%rax), %rsp
786 .Lmul_epilogue:
787 ret
788 .size rsaz_512_mul,.-rsaz_512_mul
789 ___
790 }
791 {
792 my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
793 $code.=<<___;
794 .globl rsaz_512_mul_gather4
795 .type rsaz_512_mul_gather4,\@function,6
796 .align 32
797 rsaz_512_mul_gather4:
798 push %rbx
799 push %rbp
800 push %r12
801 push %r13
802 push %r14
803 push %r15
804
805 subq \$128+24, %rsp
806 .Lmul_gather4_body:
807 movl 64($bp,$pwr,4), %eax
808 movq $out, %xmm0 # off-load arguments
809 movl ($bp,$pwr,4), %ebx
810 movq $mod, %xmm1
811 movq $n0, 128(%rsp)
812
813 shlq \$32, %rax
814 or %rax, %rbx
815 movq ($ap), %rax
816 movq 8($ap), %rcx
817 leaq 128($bp,$pwr,4), %rbp
818 mulq %rbx # 0 iteration
819 movq %rax, (%rsp)
820 movq %rcx, %rax
821 movq %rdx, %r8
822
823 mulq %rbx
824 movd (%rbp), %xmm4
825 addq %rax, %r8
826 movq 16($ap), %rax
827 movq %rdx, %r9
828 adcq \$0, %r9
829
830 mulq %rbx
831 movd 64(%rbp), %xmm5
832 addq %rax, %r9
833 movq 24($ap), %rax
834 movq %rdx, %r10
835 adcq \$0, %r10
836
837 mulq %rbx
838 pslldq \$4, %xmm5
839 addq %rax, %r10
840 movq 32($ap), %rax
841 movq %rdx, %r11
842 adcq \$0, %r11
843
844 mulq %rbx
845 por %xmm5, %xmm4
846 addq %rax, %r11
847 movq 40($ap), %rax
848 movq %rdx, %r12
849 adcq \$0, %r12
850
851 mulq %rbx
852 addq %rax, %r12
853 movq 48($ap), %rax
854 movq %rdx, %r13
855 adcq \$0, %r13
856
857 mulq %rbx
858 leaq 128(%rbp), %rbp
859 addq %rax, %r13
860 movq 56($ap), %rax
861 movq %rdx, %r14
862 adcq \$0, %r14
863
864 mulq %rbx
865 movq %xmm4, %rbx
866 addq %rax, %r14
867 movq ($ap), %rax
868 movq %rdx, %r15
869 adcq \$0, %r15
870
871 leaq 8(%rsp), %rdi
872 movl \$7, %ecx
873 jmp .Loop_mul_gather
874
875 .align 32
876 .Loop_mul_gather:
877 mulq %rbx
878 addq %rax, %r8
879 movq 8($ap), %rax
880 movq %r8, (%rdi)
881 movq %rdx, %r8
882 adcq \$0, %r8
883
884 mulq %rbx
885 movd (%rbp), %xmm4
886 addq %rax, %r9
887 movq 16($ap), %rax
888 adcq \$0, %rdx
889 addq %r9, %r8
890 movq %rdx, %r9
891 adcq \$0, %r9
892
893 mulq %rbx
894 movd 64(%rbp), %xmm5
895 addq %rax, %r10
896 movq 24($ap), %rax
897 adcq \$0, %rdx
898 addq %r10, %r9
899 movq %rdx, %r10
900 adcq \$0, %r10
901
902 mulq %rbx
903 pslldq \$4, %xmm5
904 addq %rax, %r11
905 movq 32($ap), %rax
906 adcq \$0, %rdx
907 addq %r11, %r10
908 movq %rdx, %r11
909 adcq \$0, %r11
910
911 mulq %rbx
912 por %xmm5, %xmm4
913 addq %rax, %r12
914 movq 40($ap), %rax
915 adcq \$0, %rdx
916 addq %r12, %r11
917 movq %rdx, %r12
918 adcq \$0, %r12
919
920 mulq %rbx
921 addq %rax, %r13
922 movq 48($ap), %rax
923 adcq \$0, %rdx
924 addq %r13, %r12
925 movq %rdx, %r13
926 adcq \$0, %r13
927
928 mulq %rbx
929 addq %rax, %r14
930 movq 56($ap), %rax
931 adcq \$0, %rdx
932 addq %r14, %r13
933 movq %rdx, %r14
934 adcq \$0, %r14
935
936 mulq %rbx
937 movq %xmm4, %rbx
938 addq %rax, %r15
939 movq ($ap), %rax
940 adcq \$0, %rdx
941 addq %r15, %r14
942 movq %rdx, %r15
943 adcq \$0, %r15
944
945 leaq 128(%rbp), %rbp
946 leaq 8(%rdi), %rdi
947
948 decl %ecx
949 jnz .Loop_mul_gather
950
951 movq %r8, (%rdi)
952 movq %r9, 8(%rdi)
953 movq %r10, 16(%rdi)
954 movq %r11, 24(%rdi)
955 movq %r12, 32(%rdi)
956 movq %r13, 40(%rdi)
957 movq %r14, 48(%rdi)
958 movq %r15, 56(%rdi)
959
960 movq %xmm0, $out
961 movq %xmm1, %rbp
962
963 movq (%rsp), %r8
964 movq 8(%rsp), %r9
965 movq 16(%rsp), %r10
966 movq 24(%rsp), %r11
967 movq 32(%rsp), %r12
968 movq 40(%rsp), %r13
969 movq 48(%rsp), %r14
970 movq 56(%rsp), %r15
971
972 call _rsaz_512_reduce
973
974 addq 64(%rsp), %r8
975 adcq 72(%rsp), %r9
976 adcq 80(%rsp), %r10
977 adcq 88(%rsp), %r11
978 adcq 96(%rsp), %r12
979 adcq 104(%rsp), %r13
980 adcq 112(%rsp), %r14
981 adcq 120(%rsp), %r15
982 sbbq %rcx, %rcx
983
984 call _rsaz_512_subtract
985
986 leaq 128+24+48(%rsp), %rax
987 movq -48(%rax), %r15
988 movq -40(%rax), %r14
989 movq -32(%rax), %r13
990 movq -24(%rax), %r12
991 movq -16(%rax), %rbp
992 movq -8(%rax), %rbx
993 leaq (%rax), %rsp
994 .Lmul_gather4_epilogue:
995 ret
996 .size rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
997 ___
998 }
999 {
1000 my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1001 $code.=<<___;
1002 .globl rsaz_512_mul_scatter4
1003 .type rsaz_512_mul_scatter4,\@function,6
1004 .align 32
1005 rsaz_512_mul_scatter4:
1006 push %rbx
1007 push %rbp
1008 push %r12
1009 push %r13
1010 push %r14
1011 push %r15
1012
1013 subq \$128+24, %rsp
1014 .Lmul_scatter4_body:
1015 leaq ($tbl,$pwr,4), $tbl
1016 movq $out, %xmm0 # off-load arguments
1017 movq $mod, %xmm1
1018 movq $tbl, %xmm2
1019 movq $n0, 128(%rsp)
1020
1021 movq $out, %rbp
1022 call __rsaz_512_mul
1023
1024 movq %xmm0, $out
1025 movq %xmm1, %rbp
1026
1027 movq (%rsp), %r8
1028 movq 8(%rsp), %r9
1029 movq 16(%rsp), %r10
1030 movq 24(%rsp), %r11
1031 movq 32(%rsp), %r12
1032 movq 40(%rsp), %r13
1033 movq 48(%rsp), %r14
1034 movq 56(%rsp), %r15
1035
1036 call _rsaz_512_reduce
1037
1038 addq 64(%rsp), %r8
1039 adcq 72(%rsp), %r9
1040 adcq 80(%rsp), %r10
1041 adcq 88(%rsp), %r11
1042 adcq 96(%rsp), %r12
1043 adcq 104(%rsp), %r13
1044 adcq 112(%rsp), %r14
1045 adcq 120(%rsp), %r15
1046 movq %xmm2, $inp
1047 sbbq %rcx, %rcx
1048
1049 call _rsaz_512_subtract
1050
1051 movl %r8d, 64*0($inp) # scatter
1052 shrq \$32, %r8
1053 movl %r9d, 64*2($inp)
1054 shrq \$32, %r9
1055 movl %r10d, 64*4($inp)
1056 shrq \$32, %r10
1057 movl %r11d, 64*6($inp)
1058 shrq \$32, %r11
1059 movl %r12d, 64*8($inp)
1060 shrq \$32, %r12
1061 movl %r13d, 64*10($inp)
1062 shrq \$32, %r13
1063 movl %r14d, 64*12($inp)
1064 shrq \$32, %r14
1065 movl %r15d, 64*14($inp)
1066 shrq \$32, %r15
1067 movl %r8d, 64*1($inp)
1068 movl %r9d, 64*3($inp)
1069 movl %r10d, 64*5($inp)
1070 movl %r11d, 64*7($inp)
1071 movl %r12d, 64*9($inp)
1072 movl %r13d, 64*11($inp)
1073 movl %r14d, 64*13($inp)
1074 movl %r15d, 64*15($inp)
1075
1076 leaq 128+24+48(%rsp), %rax
1077 movq -48(%rax), %r15
1078 movq -40(%rax), %r14
1079 movq -32(%rax), %r13
1080 movq -24(%rax), %r12
1081 movq -16(%rax), %rbp
1082 movq -8(%rax), %rbx
1083 leaq (%rax), %rsp
1084 .Lmul_scatter4_epilogue:
1085 ret
1086 .size rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
1087 ___
1088 }
1089 {
1090 my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx");
1091 $code.=<<___;
1092 .globl rsaz_512_mul_by_one
1093 .type rsaz_512_mul_by_one,\@function,4
1094 .align 32
1095 rsaz_512_mul_by_one:
1096 push %rbx
1097 push %rbp
1098 push %r12
1099 push %r13
1100 push %r14
1101 push %r15
1102
1103 subq \$128+24, %rsp
1104 .Lmul_by_one_body:
1105 movq $mod, %rbp # reassign argument
1106 movq $n0, 128(%rsp)
1107
1108 movq ($inp), %r8
1109 pxor %xmm0, %xmm0
1110 movq 8($inp), %r9
1111 movq 16($inp), %r10
1112 movq 24($inp), %r11
1113 movq 32($inp), %r12
1114 movq 40($inp), %r13
1115 movq 48($inp), %r14
1116 movq 56($inp), %r15
1117
1118 movdqa %xmm0, (%rsp)
1119 movdqa %xmm0, 16(%rsp)
1120 movdqa %xmm0, 32(%rsp)
1121 movdqa %xmm0, 48(%rsp)
1122 movdqa %xmm0, 64(%rsp)
1123 movdqa %xmm0, 80(%rsp)
1124 movdqa %xmm0, 96(%rsp)
1125
1126 call _rsaz_512_reduce
1127
1128 movq %r8, ($out)
1129 movq %r9, 8($out)
1130 movq %r10, 16($out)
1131 movq %r11, 24($out)
1132 movq %r12, 32($out)
1133 movq %r13, 40($out)
1134 movq %r14, 48($out)
1135 movq %r15, 56($out)
1136
1137 leaq 128+24+48(%rsp), %rax
1138 movq -48(%rax), %r15
1139 movq -40(%rax), %r14
1140 movq -32(%rax), %r13
1141 movq -24(%rax), %r12
1142 movq -16(%rax), %rbp
1143 movq -8(%rax), %rbx
1144 leaq (%rax), %rsp
1145 .Lmul_by_one_epilogue:
1146 ret
1147 .size rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
1148 ___
1149 }
1150 { # _rsaz_512_reduce
1151 #
1152 # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
1153 # output: %r8-%r15
1154 # clobbers: everything except %rbp and %rdi
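#
# This amounts to word-by-word Montgomery reduction: on each of the eight
# iterations %rbx holds m = (low limb * n0) mod 2^64, and adding m*mod
# cancels that low limb, so the window in %r8-%r15 shifts down by one limb
# per iteration. The "negq %r8" recovers the carry out of the cancelled
# limb, and the next m is pre-computed mid-loop with imulq. The caller
# subsequently adds the upper eight limbs of the 1024-bit product and
# calls _rsaz_512_subtract for the final conditional reduction.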
1155 $code.=<<___;
1156 .type _rsaz_512_reduce,\@abi-omnipotent
1157 .align 32
1158 _rsaz_512_reduce:
1159 ___
1160 if (1) {
1161 $code.=<<___;
1162 movq %r8, %rbx
1163 imulq 128+8(%rsp), %rbx
1164 movq 0(%rbp), %rax
1165 movl \$8, %ecx
1166 jmp .Lreduction_loop
1167
1168 .align 32
1169 .Lreduction_loop:
1170 mulq %rbx
1171 movq 8(%rbp), %rax
1172 negq %r8
1173 movq %rdx, %r8
1174 adcq \$0, %r8
1175
1176 mulq %rbx
1177 addq %rax, %r9
1178 movq 16(%rbp), %rax
1179 adcq \$0, %rdx
1180 addq %r9, %r8
1181 movq %rdx, %r9
1182 adcq \$0, %r9
1183
1184 mulq %rbx
1185 addq %rax, %r10
1186 movq 24(%rbp), %rax
1187 adcq \$0, %rdx
1188 addq %r10, %r9
1189 movq %rdx, %r10
1190 adcq \$0, %r10
1191
1192 mulq %rbx
1193 addq %rax, %r11
1194 movq 32(%rbp), %rax
1195 adcq \$0, %rdx
1196 addq %r11, %r10
1197 movq 128+8(%rsp), %rsi
1198 movq %rdx, %r11
1199 adcq \$0, %r11
1200
1201 mulq %rbx
1202 addq %rax, %r12
1203 movq 40(%rbp), %rax
1204 adcq \$0, %rdx
1205 imulq %r8, %rsi
1206 addq %r12, %r11
1207 movq %rdx, %r12
1208 adcq \$0, %r12
1209
1210 mulq %rbx
1211 addq %rax, %r13
1212 movq 48(%rbp), %rax
1213 adcq \$0, %rdx
1214 addq %r13, %r12
1215 movq %rdx, %r13
1216 adcq \$0, %r13
1217
1218 mulq %rbx
1219 addq %rax, %r14
1220 movq 56(%rbp), %rax
1221 adcq \$0, %rdx
1222 addq %r14, %r13
1223 movq %rdx, %r14
1224 adcq \$0, %r14
1225
1226 mulq %rbx
1227 movq %rsi, %rbx
1228 addq %rax, %r15
1229 movq 0(%rbp), %rax
1230 adcq \$0, %rdx
1231 addq %r15, %r14
1232 movq %rdx, %r15
1233 adcq \$0, %r15
1234
1235 decl %ecx
1236 jne .Lreduction_loop
1237 ___
1238 } else {
1239 $code.=<<___;
1240 movq 128+8(%rsp), %rdx # pull $n0
1241 imulq %r8, %rdx
1242 movl \$8, %ecx
1243 jmp .Lreduction_loop
1244
1245 .align 32
1246 .Lreduction_loop:
1247 neg %r8
1248 mulx 0(%rbp), %rax, %r8
1249 adc %r9, %r8
1250
1251 mulx 8(%rbp), %rax, %r9
1252 adc \$0, %r9
1253 add %rax, %r8
1254 adc %r10, %r9
1255
1256 mulx 16(%rbp), %rax, %r10
1257 adc \$0, %r10
1258 mov 128+8(%rsp), %rbx # pull $n0
1259 imul %r8, %rbx
1260 add %rax, %r9
1261 adc %r11, %r10
1262
1263 mulx 24(%rbp), %rax, %r11
1264 adc \$0, %r11
1265 add %rax, %r10
1266 adc %r12, %r11
1267
1268 mulx 32(%rbp), %rax, %r12
1269 adc \$0, %r12
1270 add %rax, %r11
1271 adc %r13, %r12
1272
1273 mulx 40(%rbp), %rax, %r13
1274 adc \$0, %r13
1275 add %rax, %r12
1276 adc %r14, %r13
1277
1278 mulx 48(%rbp), %rax, %r14
1279 adc \$0, %r14
1280 add %rax, %r13
1281 adc %r15, %r14
1282
1283 mulx 56(%rbp), %rax, %r15
1284 mov %rbx, %rdx
1285 adc \$0, %r15
1286 add %rax, %r14
1287 adc \$0, %r15
1288
1289 dec %ecx
1290 jne .Lreduction_loop
1291 ___
1292 }
1293 $code.=<<___;
1294 ret
1295 .size _rsaz_512_reduce,.-_rsaz_512_reduce
1296 ___
1297 }
1298 { # _rsaz_512_subtract
1299 # input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask
1300 # output:
1301 # clobbers: everything but %rdi, %rsi and %rbp
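#
# %rcx is expected to be a 0/-1 mask (the callers derive it with
# "sbbq %rcx, %rcx" from the carry of adding the upper product limbs).
# The negq/notq sequence forms -mod limb by limb (exact because the
# modulus is odd), masks it with %rcx, and adds it to the stored result,
# i.e. the modulus is subtracted exactly when the preceding addition
# overflowed, without any data-dependent branch.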
1302 $code.=<<___;
1303 .type _rsaz_512_subtract,\@abi-omnipotent
1304 .align 32
1305 _rsaz_512_subtract:
1306 movq %r8, ($out)
1307 movq %r9, 8($out)
1308 movq %r10, 16($out)
1309 movq %r11, 24($out)
1310 movq %r12, 32($out)
1311 movq %r13, 40($out)
1312 movq %r14, 48($out)
1313 movq %r15, 56($out)
1314
1315 movq 0($mod), %r8
1316 movq 8($mod), %r9
1317 negq %r8
1318 notq %r9
1319 andq %rcx, %r8
1320 movq 16($mod), %r10
1321 andq %rcx, %r9
1322 notq %r10
1323 movq 24($mod), %r11
1324 andq %rcx, %r10
1325 notq %r11
1326 movq 32($mod), %r12
1327 andq %rcx, %r11
1328 notq %r12
1329 movq 40($mod), %r13
1330 andq %rcx, %r12
1331 notq %r13
1332 movq 48($mod), %r14
1333 andq %rcx, %r13
1334 notq %r14
1335 movq 56($mod), %r15
1336 andq %rcx, %r14
1337 notq %r15
1338 andq %rcx, %r15
1339
1340 addq ($out), %r8
1341 adcq 8($out), %r9
1342 adcq 16($out), %r10
1343 adcq 24($out), %r11
1344 adcq 32($out), %r12
1345 adcq 40($out), %r13
1346 adcq 48($out), %r14
1347 adcq 56($out), %r15
1348
1349 movq %r8, ($out)
1350 movq %r9, 8($out)
1351 movq %r10, 16($out)
1352 movq %r11, 24($out)
1353 movq %r12, 32($out)
1354 movq %r13, 40($out)
1355 movq %r14, 48($out)
1356 movq %r15, 56($out)
1357
1358 ret
1359 .size _rsaz_512_subtract,.-_rsaz_512_subtract
1360 ___
1361 }
1362 { # __rsaz_512_mul
1363 #
1364 # input: %rsi - ap, %rbp - bp
1365 # output:
1366 # clobbers: everything
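#
# Plain schoolbook 512x512-bit multiplication: the unrolled head multiplies
# ap[0..7] by bp[0] and stores the lowest limb, then the 7-round loop
# accumulates ap[0..7]*bp[i], advancing the destination by one limb each
# round. The 16-limb product lands in the caller's scratch area (the
# "leaq 8(%rsp)" skips this function's return address), ready for
# _rsaz_512_reduce.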
1367 my ($ap,$bp) = ("%rsi","%rbp");
1368 $code.=<<___;
1369 .type __rsaz_512_mul,\@abi-omnipotent
1370 .align 32
1371 __rsaz_512_mul:
1372 leaq 8(%rsp), %rdi
1373
1374 movq ($bp), %rbx
1375 movq ($ap), %rax
1376 mulq %rbx
1377 movq %rax, (%rdi)
1378 movq 8($ap), %rax
1379 movq %rdx, %r8
1380
1381 mulq %rbx
1382 addq %rax, %r8
1383 movq 16($ap), %rax
1384 movq %rdx, %r9
1385 adcq \$0, %r9
1386
1387 mulq %rbx
1388 addq %rax, %r9
1389 movq 24($ap), %rax
1390 movq %rdx, %r10
1391 adcq \$0, %r10
1392
1393 mulq %rbx
1394 addq %rax, %r10
1395 movq 32($ap), %rax
1396 movq %rdx, %r11
1397 adcq \$0, %r11
1398
1399 mulq %rbx
1400 addq %rax, %r11
1401 movq 40($ap), %rax
1402 movq %rdx, %r12
1403 adcq \$0, %r12
1404
1405 mulq %rbx
1406 addq %rax, %r12
1407 movq 48($ap), %rax
1408 movq %rdx, %r13
1409 adcq \$0, %r13
1410
1411 mulq %rbx
1412 addq %rax, %r13
1413 movq 56($ap), %rax
1414 movq %rdx, %r14
1415 adcq \$0, %r14
1416
1417 mulq %rbx
1418 addq %rax, %r14
1419 movq ($ap), %rax
1420 movq %rdx, %r15
1421 adcq \$0, %r15
1422
1423 leaq 8($bp), $bp
1424 leaq 8(%rdi), %rdi
1425
1426 movl \$7, %ecx
1427 jmp .Loop_mul
1428
1429 .align 32
1430 .Loop_mul:
1431 movq ($bp), %rbx
1432 mulq %rbx
1433 addq %rax, %r8
1434 movq 8($ap), %rax
1435 movq %r8, (%rdi)
1436 movq %rdx, %r8
1437 adcq \$0, %r8
1438
1439 mulq %rbx
1440 addq %rax, %r9
1441 movq 16($ap), %rax
1442 adcq \$0, %rdx
1443 addq %r9, %r8
1444 movq %rdx, %r9
1445 adcq \$0, %r9
1446
1447 mulq %rbx
1448 addq %rax, %r10
1449 movq 24($ap), %rax
1450 adcq \$0, %rdx
1451 addq %r10, %r9
1452 movq %rdx, %r10
1453 adcq \$0, %r10
1454
1455 mulq %rbx
1456 addq %rax, %r11
1457 movq 32($ap), %rax
1458 adcq \$0, %rdx
1459 addq %r11, %r10
1460 movq %rdx, %r11
1461 adcq \$0, %r11
1462
1463 mulq %rbx
1464 addq %rax, %r12
1465 movq 40($ap), %rax
1466 adcq \$0, %rdx
1467 addq %r12, %r11
1468 movq %rdx, %r12
1469 adcq \$0, %r12
1470
1471 mulq %rbx
1472 addq %rax, %r13
1473 movq 48($ap), %rax
1474 adcq \$0, %rdx
1475 addq %r13, %r12
1476 movq %rdx, %r13
1477 adcq \$0, %r13
1478
1479 mulq %rbx
1480 addq %rax, %r14
1481 movq 56($ap), %rax
1482 adcq \$0, %rdx
1483 addq %r14, %r13
1484 movq %rdx, %r14
1485 leaq 8($bp), $bp
1486 adcq \$0, %r14
1487
1488 mulq %rbx
1489 addq %rax, %r15
1490 movq ($ap), %rax
1491 adcq \$0, %rdx
1492 addq %r15, %r14
1493 movq %rdx, %r15
1494 adcq \$0, %r15
1495
1496 leaq 8(%rdi), %rdi
1497
1498 decl %ecx
1499 jnz .Loop_mul
1500
1501 movq %r8, (%rdi)
1502 movq %r9, 8(%rdi)
1503 movq %r10, 16(%rdi)
1504 movq %r11, 24(%rdi)
1505 movq %r12, 32(%rdi)
1506 movq %r13, 40(%rdi)
1507 movq %r14, 48(%rdi)
1508 movq %r15, 56(%rdi)
1509
1510 ret
1511 .size __rsaz_512_mul,.-__rsaz_512_mul
1512 ___
1513 }
1514 {
1515 my ($out,$inp,$power)= $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
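#
# Table layout shared by rsaz_512_scatter4 and rsaz_512_gather4: every
# 64-bit limb is split into two 32-bit halves stored 64 bytes apart, with
# consecutive limbs 128 bytes apart, so the table interleaves 16 values of
# 512 bits and "power" selects a 4-byte column within each 64-byte row.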
1516 $code.=<<___;
1517 .globl rsaz_512_scatter4
1518 .type rsaz_512_scatter4,\@abi-omnipotent
1519 .align 16
1520 rsaz_512_scatter4:
1521 leaq ($out,$power,4), $out
1522 movl \$8, %r9d
1523 jmp .Loop_scatter
1524 .align 16
1525 .Loop_scatter:
1526 movq ($inp), %rax
1527 leaq 8($inp), $inp
1528 movl %eax, ($out)
1529 shrq \$32, %rax
1530 movl %eax, 64($out)
1531 leaq 128($out), $out
1532 decl %r9d
1533 jnz .Loop_scatter
1534 ret
1535 .size rsaz_512_scatter4,.-rsaz_512_scatter4
1536
1537 .globl rsaz_512_gather4
1538 .type rsaz_512_gather4,\@abi-omnipotent
1539 .align 16
1540 rsaz_512_gather4:
1541 leaq ($inp,$power,4), $inp
1542 movl \$8, %r9d
1543 jmp .Loop_gather
1544 .align 16
1545 .Loop_gather:
1546 movl ($inp), %eax
1547 movl 64($inp), %r8d
1548 leaq 128($inp), $inp
1549 shlq \$32, %r8
1550 or %r8, %rax
1551 movq %rax, ($out)
1552 leaq 8($out), $out
1553 decl %r9d
1554 jnz .Loop_gather
1555 ret
1556 .size rsaz_512_gather4,.-rsaz_512_gather4
1557 ___
1558 }
1559
1560 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1561 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
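# The common se_handler below serves all five SEH-registered entry points:
# it compares context->Rip against the [body,epilogue) labels recorded in
# HandlerData and, when inside that range, restores %rbx, %rbp and
# %r12-%r15 from the fixed frame (128+24 bytes of scratch plus six saved
# registers) before handing control to RtlVirtualUnwind.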
1562 if ($win64) {
1563 $rec="%rcx";
1564 $frame="%rdx";
1565 $context="%r8";
1566 $disp="%r9";
1567
1568 $code.=<<___;
1569 .extern __imp_RtlVirtualUnwind
1570 .type se_handler,\@abi-omnipotent
1571 .align 16
1572 se_handler:
1573 push %rsi
1574 push %rdi
1575 push %rbx
1576 push %rbp
1577 push %r12
1578 push %r13
1579 push %r14
1580 push %r15
1581 pushfq
1582 sub \$64,%rsp
1583
1584 mov 120($context),%rax # pull context->Rax
1585 mov 248($context),%rbx # pull context->Rip
1586
1587 mov 8($disp),%rsi # disp->ImageBase
1588 mov 56($disp),%r11 # disp->HandlerData
1589
1590 mov 0(%r11),%r10d # HandlerData[0]
1591 lea (%rsi,%r10),%r10 # end of prologue label
1592 cmp %r10,%rbx # context->Rip<end of prologue label
1593 jb .Lcommon_seh_tail
1594
1595 mov 152($context),%rax # pull context->Rsp
1596
1597 mov 4(%r11),%r10d # HandlerData[1]
1598 lea (%rsi,%r10),%r10 # epilogue label
1599 cmp %r10,%rbx # context->Rip>=epilogue label
1600 jae .Lcommon_seh_tail
1601
1602 lea 128+24+48(%rax),%rax
1603
1604 mov -8(%rax),%rbx
1605 mov -16(%rax),%rbp
1606 mov -24(%rax),%r12
1607 mov -32(%rax),%r13
1608 mov -40(%rax),%r14
1609 mov -48(%rax),%r15
1610 mov %rbx,144($context) # restore context->Rbx
1611 mov %rbp,160($context) # restore context->Rbp
1612 mov %r12,216($context) # restore context->R12
1613 mov %r13,224($context) # restore context->R13
1614 mov %r14,232($context) # restore context->R14
1615 mov %r15,240($context) # restore context->R15
1616
1617 .Lcommon_seh_tail:
1618 mov 8(%rax),%rdi
1619 mov 16(%rax),%rsi
1620 mov %rax,152($context) # restore context->Rsp
1621 mov %rsi,168($context) # restore context->Rsi
1622 mov %rdi,176($context) # restore context->Rdi
1623
1624 mov 40($disp),%rdi # disp->ContextRecord
1625 mov $context,%rsi # context
1626 mov \$154,%ecx # sizeof(CONTEXT)
1627 .long 0xa548f3fc # cld; rep movsq
1628
1629 mov $disp,%rsi
1630 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1631 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1632 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1633 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1634 mov 40(%rsi),%r10 # disp->ContextRecord
1635 lea 56(%rsi),%r11 # &disp->HandlerData
1636 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1637 mov %r10,32(%rsp) # arg5
1638 mov %r11,40(%rsp) # arg6
1639 mov %r12,48(%rsp) # arg7
1640 mov %rcx,56(%rsp) # arg8, (NULL)
1641 call *__imp_RtlVirtualUnwind(%rip)
1642
1643 mov \$1,%eax # ExceptionContinueSearch
1644 add \$64,%rsp
1645 popfq
1646 pop %r15
1647 pop %r14
1648 pop %r13
1649 pop %r12
1650 pop %rbp
1651 pop %rbx
1652 pop %rdi
1653 pop %rsi
1654 ret
1655 .size se_handler,.-se_handler
1656
1657 .section .pdata
1658 .align 4
1659 .rva .LSEH_begin_rsaz_512_sqr
1660 .rva .LSEH_end_rsaz_512_sqr
1661 .rva .LSEH_info_rsaz_512_sqr
1662
1663 .rva .LSEH_begin_rsaz_512_mul
1664 .rva .LSEH_end_rsaz_512_mul
1665 .rva .LSEH_info_rsaz_512_mul
1666
1667 .rva .LSEH_begin_rsaz_512_mul_gather4
1668 .rva .LSEH_end_rsaz_512_mul_gather4
1669 .rva .LSEH_info_rsaz_512_mul_gather4
1670
1671 .rva .LSEH_begin_rsaz_512_mul_scatter4
1672 .rva .LSEH_end_rsaz_512_mul_scatter4
1673 .rva .LSEH_info_rsaz_512_mul_scatter4
1674
1675 .rva .LSEH_begin_rsaz_512_mul_by_one
1676 .rva .LSEH_end_rsaz_512_mul_by_one
1677 .rva .LSEH_info_rsaz_512_mul_by_one
1678
1679 .section .xdata
1680 .align 8
1681 .LSEH_info_rsaz_512_sqr:
1682 .byte 9,0,0,0
1683 .rva se_handler
1684 .rva .Lsqr_body,.Lsqr_epilogue # HandlerData[]
1685 .LSEH_info_rsaz_512_mul:
1686 .byte 9,0,0,0
1687 .rva se_handler
1688 .rva .Lmul_body,.Lmul_epilogue # HandlerData[]
1689 .LSEH_info_rsaz_512_mul_gather4:
1690 .byte 9,0,0,0
1691 .rva se_handler
1692 .rva .Lmul_gather4_body,.Lmul_gather4_epilogue # HandlerData[]
1693 .LSEH_info_rsaz_512_mul_scatter4:
1694 .byte 9,0,0,0
1695 .rva se_handler
1696 .rva .Lmul_scatter4_body,.Lmul_scatter4_epilogue # HandlerData[]
1697 .LSEH_info_rsaz_512_mul_by_one:
1698 .byte 9,0,0,0
1699 .rva se_handler
1700 .rva .Lmul_by_one_body,.Lmul_by_one_epilogue # HandlerData[]
1701 ___
1702 }
1703
1704 $code =~ s/\`([^\`]*)\`/eval $1/gem;
1705 print $code;
1706 close STDOUT;