1 #!/usr/bin/env perl
2
3 #******************************************************************************#
4 #* Copyright(c) 2012, Intel Corp. *#
5 #* Developers and authors: *#
6 #* Shay Gueron (1, 2), and Vlad Krasnov (1) *#
7 #* (1) Intel Architecture Group, Microprocessor and Chipset Development, *#
8 #* Israel Development Center, Haifa, Israel *#
9 #* (2) University of Haifa *#
10 #******************************************************************************#
11 #* This submission to OpenSSL is to be made available under the OpenSSL *#
12 #* license, and only to the OpenSSL project, in order to allow integration *#
#* into the publicly distributed code.                                        *#
14 #* The use of this code, or portions of this code, or concepts embedded in *#
15 #* this code, or modification of this code and/or algorithm(s) in it, or the *#
16 #* use of this code for any other purpose than stated above, requires special *#
17 #* licensing. *#
18 #******************************************************************************#
19 #******************************************************************************#
20 #* DISCLAIMER: *#
21 #* THIS SOFTWARE IS PROVIDED BY THE CONTRIBUTORS AND THE COPYRIGHT OWNERS *#
22 #* ``AS IS''. ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED *#
23 #* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR *#
24 #* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS OR THE COPYRIGHT*#
25 #* OWNERS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, *#
26 #* OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF *#
27 #* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS *#
28 #* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN *#
29 #* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) *#
30 #* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE *#
31 #* POSSIBILITY OF SUCH DAMAGE. *#
32 #******************************************************************************#
33 #* Reference: *#
34 #* [1] S. Gueron, "Efficient Software Implementations of Modular *#
35 #* Exponentiation", http://eprint.iacr.org/2011/239 *#
36 #* [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring". *#
37 #* IEEE Proceedings of 9th International Conference on Information *#
38 #* Technology: New Generations (ITNG 2012), 821-823 (2012). *#
39 #* [3] S. Gueron, Efficient Software Implementations of Modular Exponentiation*#
40 #* Journal of Cryptographic Engineering 2:31-43 (2012). *#
41 #* [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis *#
42 #* resistant 512-bit and 1024-bit modular exponentiation for optimizing *#
43 #* RSA1024 and RSA2048 on x86_64 platforms", *#
44 #* http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest*#
45 ################################################################################
46
# While the original submission covers 512- and 1024-bit exponentiation,
# this module is limited to the 512-bit version only (and as such
# accelerates RSA1024 sign). This is because the improvement for longer
# keys is not high enough to justify the effort; the highest measured
# gain was ~5% on Westmere. [This is relative to OpenSSL 1.0.2, which is
# upcoming at the moment of this writing!] Nor does this module implement
# a "monolithic" all-in-one exponentiation subroutine; it adheres to a
# more modular mixture of C and assembly (a rough sketch of what the
# exported routines compute follows the table below). And it is optimized
# even for processors other than the Intel Core family (see the table
# below for improvement coefficients).
57 # <appro@openssl.org>
58 #
59 # RSA1024 sign/sec this/original |this/rsax(*) this/fips(*)
60 # ----------------+---------------------------
61 # Opteron +13% |+5% +20%
62 # Bulldozer -0% |-1% +10%
63 # P4 +11% |+7% +8%
64 # Westmere +5% |+14% +17%
65 # Sandy Bridge +2% |+12% +29%
66 # Ivy Bridge +1% |+11% +35%
67 # Haswell(**) -0% |+12% +39%
68 # Atom +13% |+11% +4%
69 # VIA Nano +70% |+9% +25%
70 #
71 # (*) rsax engine and fips numbers are presented for reference
72 # purposes;
73 # (**) MULX was attempted, but found to give only marginal improvement;
74
75 $flavour = shift;
76 $output = shift;
77 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
78
79 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
80
81 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
82 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
83 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
84 die "can't locate x86_64-xlate.pl";
85
86 open OUT,"| $^X $xlate $flavour $output";
87 *STDOUT=*OUT;
88
89 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
90 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
91 $addx = ($1>=2.23);
92 }
93
94 if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
95 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
96 $addx = ($1>=2.10);
97 }
98
99 if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
100 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
101 $addx = ($1>=11);
102 }
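# $addx is set when the assembler is recent enough to encode the ADCX/ADOX
# and MULX instructions (probed above for GNU as, nasm and ml64); it gates
# emission of the alternative code paths below that use them.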
103
104 ($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp"); # common internal API
105 {
106 my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d");
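# rsaz_512_sqr performs $times consecutive Montgomery squarings of $inp,
# each one computing x <- x^2 * 2^-512 mod $mod (possibly only partially
# reduced, like the other routines here).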
107
108 $code.=<<___;
109 .text
110
111 .extern OPENSSL_ia32cap_P
112
113 .globl rsaz_512_sqr
114 .type rsaz_512_sqr,\@function,4
115 .align 32
116 rsaz_512_sqr: # 25-29% faster than rsaz_512_mul
117 push %rbx
118 push %rbp
119 push %r12
120 push %r13
121 push %r14
122 push %r15
123
124 subq \$128+24, %rsp
125 .Lsqr_body:
126 movq $mod, %rbp # common argument
127 movq ($inp), %rdx
128 movq 8($inp), %rax
129 movq $n0, 128(%rsp)
130 ___
131 $code.=<<___ if ($addx);
132 movl \$0x80100,%r11d
133 andl OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	\$0x80100,%r11d		# check for MULX and ADCX/ADOX
135 je .Loop_sqrx
136 ___
137 $code.=<<___;
138 jmp .Loop_sqr
139
140 .align 32
141 .Loop_sqr:
142 movl $times,128+8(%rsp)
143 #first iteration
144 movq %rdx, %rbx
145 mulq %rdx
146 movq %rax, %r8
147 movq 16($inp), %rax
148 movq %rdx, %r9
149
150 mulq %rbx
151 addq %rax, %r9
152 movq 24($inp), %rax
153 movq %rdx, %r10
154 adcq \$0, %r10
155
156 mulq %rbx
157 addq %rax, %r10
158 movq 32($inp), %rax
159 movq %rdx, %r11
160 adcq \$0, %r11
161
162 mulq %rbx
163 addq %rax, %r11
164 movq 40($inp), %rax
165 movq %rdx, %r12
166 adcq \$0, %r12
167
168 mulq %rbx
169 addq %rax, %r12
170 movq 48($inp), %rax
171 movq %rdx, %r13
172 adcq \$0, %r13
173
174 mulq %rbx
175 addq %rax, %r13
176 movq 56($inp), %rax
177 movq %rdx, %r14
178 adcq \$0, %r14
179
180 mulq %rbx
181 addq %rax, %r14
182 movq %rbx, %rax
183 movq %rdx, %r15
184 adcq \$0, %r15
185
186 addq %r8, %r8 #shlq \$1, %r8
187 movq %r9, %rcx
188 adcq %r9, %r9 #shld \$1, %r8, %r9
189
190 mulq %rax
191 movq %rax, (%rsp)
192 addq %rdx, %r8
193 adcq \$0, %r9
194
195 movq %r8, 8(%rsp)
196 shrq \$63, %rcx
197
198 #second iteration
199 movq 8($inp), %r8
200 movq 16($inp), %rax
201 mulq %r8
202 addq %rax, %r10
203 movq 24($inp), %rax
204 movq %rdx, %rbx
205 adcq \$0, %rbx
206
207 mulq %r8
208 addq %rax, %r11
209 movq 32($inp), %rax
210 adcq \$0, %rdx
211 addq %rbx, %r11
212 movq %rdx, %rbx
213 adcq \$0, %rbx
214
215 mulq %r8
216 addq %rax, %r12
217 movq 40($inp), %rax
218 adcq \$0, %rdx
219 addq %rbx, %r12
220 movq %rdx, %rbx
221 adcq \$0, %rbx
222
223 mulq %r8
224 addq %rax, %r13
225 movq 48($inp), %rax
226 adcq \$0, %rdx
227 addq %rbx, %r13
228 movq %rdx, %rbx
229 adcq \$0, %rbx
230
231 mulq %r8
232 addq %rax, %r14
233 movq 56($inp), %rax
234 adcq \$0, %rdx
235 addq %rbx, %r14
236 movq %rdx, %rbx
237 adcq \$0, %rbx
238
239 mulq %r8
240 addq %rax, %r15
241 movq %r8, %rax
242 adcq \$0, %rdx
243 addq %rbx, %r15
244 movq %rdx, %r8
245 movq %r10, %rdx
246 adcq \$0, %r8
247
248 add %rdx, %rdx
249 lea (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
250 movq %r11, %rbx
251 adcq %r11, %r11 #shld \$1, %r10, %r11
252
253 mulq %rax
254 addq %rax, %r9
255 adcq %rdx, %r10
256 adcq \$0, %r11
257
258 movq %r9, 16(%rsp)
259 movq %r10, 24(%rsp)
260 shrq \$63, %rbx
261
262 #third iteration
263 movq 16($inp), %r9
264 movq 24($inp), %rax
265 mulq %r9
266 addq %rax, %r12
267 movq 32($inp), %rax
268 movq %rdx, %rcx
269 adcq \$0, %rcx
270
271 mulq %r9
272 addq %rax, %r13
273 movq 40($inp), %rax
274 adcq \$0, %rdx
275 addq %rcx, %r13
276 movq %rdx, %rcx
277 adcq \$0, %rcx
278
279 mulq %r9
280 addq %rax, %r14
281 movq 48($inp), %rax
282 adcq \$0, %rdx
283 addq %rcx, %r14
284 movq %rdx, %rcx
285 adcq \$0, %rcx
286
287 mulq %r9
288 movq %r12, %r10
289 lea (%rbx,%r12,2), %r12 #shld \$1, %rbx, %r12
290 addq %rax, %r15
291 movq 56($inp), %rax
292 adcq \$0, %rdx
293 addq %rcx, %r15
294 movq %rdx, %rcx
295 adcq \$0, %rcx
296
297 mulq %r9
298 shrq \$63, %r10
299 addq %rax, %r8
300 movq %r9, %rax
301 adcq \$0, %rdx
302 addq %rcx, %r8
303 movq %rdx, %r9
304 adcq \$0, %r9
305
306 movq %r13, %rcx
307 leaq (%r10,%r13,2), %r13 #shld \$1, %r12, %r13
308
309 mulq %rax
310 addq %rax, %r11
311 adcq %rdx, %r12
312 adcq \$0, %r13
313
314 movq %r11, 32(%rsp)
315 movq %r12, 40(%rsp)
316 shrq \$63, %rcx
317
318 #fourth iteration
319 movq 24($inp), %r10
320 movq 32($inp), %rax
321 mulq %r10
322 addq %rax, %r14
323 movq 40($inp), %rax
324 movq %rdx, %rbx
325 adcq \$0, %rbx
326
327 mulq %r10
328 addq %rax, %r15
329 movq 48($inp), %rax
330 adcq \$0, %rdx
331 addq %rbx, %r15
332 movq %rdx, %rbx
333 adcq \$0, %rbx
334
335 mulq %r10
336 movq %r14, %r12
337 leaq (%rcx,%r14,2), %r14 #shld \$1, %rcx, %r14
338 addq %rax, %r8
339 movq 56($inp), %rax
340 adcq \$0, %rdx
341 addq %rbx, %r8
342 movq %rdx, %rbx
343 adcq \$0, %rbx
344
345 mulq %r10
346 shrq \$63, %r12
347 addq %rax, %r9
348 movq %r10, %rax
349 adcq \$0, %rdx
350 addq %rbx, %r9
351 movq %rdx, %r10
352 adcq \$0, %r10
353
354 movq %r15, %rbx
355 leaq (%r12,%r15,2),%r15 #shld \$1, %r14, %r15
356
357 mulq %rax
358 addq %rax, %r13
359 adcq %rdx, %r14
360 adcq \$0, %r15
361
362 movq %r13, 48(%rsp)
363 movq %r14, 56(%rsp)
364 shrq \$63, %rbx
365
366 #fifth iteration
367 movq 32($inp), %r11
368 movq 40($inp), %rax
369 mulq %r11
370 addq %rax, %r8
371 movq 48($inp), %rax
372 movq %rdx, %rcx
373 adcq \$0, %rcx
374
375 mulq %r11
376 addq %rax, %r9
377 movq 56($inp), %rax
378 adcq \$0, %rdx
379 movq %r8, %r12
380 leaq (%rbx,%r8,2), %r8 #shld \$1, %rbx, %r8
381 addq %rcx, %r9
382 movq %rdx, %rcx
383 adcq \$0, %rcx
384
385 mulq %r11
386 shrq \$63, %r12
387 addq %rax, %r10
388 movq %r11, %rax
389 adcq \$0, %rdx
390 addq %rcx, %r10
391 movq %rdx, %r11
392 adcq \$0, %r11
393
394 movq %r9, %rcx
395 leaq (%r12,%r9,2), %r9 #shld \$1, %r8, %r9
396
397 mulq %rax
398 addq %rax, %r15
399 adcq %rdx, %r8
400 adcq \$0, %r9
401
402 movq %r15, 64(%rsp)
403 movq %r8, 72(%rsp)
404 shrq \$63, %rcx
405
406 #sixth iteration
407 movq 40($inp), %r12
408 movq 48($inp), %rax
409 mulq %r12
410 addq %rax, %r10
411 movq 56($inp), %rax
412 movq %rdx, %rbx
413 adcq \$0, %rbx
414
415 mulq %r12
416 addq %rax, %r11
417 movq %r12, %rax
418 movq %r10, %r15
419 leaq (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
420 adcq \$0, %rdx
421 shrq \$63, %r15
422 addq %rbx, %r11
423 movq %rdx, %r12
424 adcq \$0, %r12
425
426 movq %r11, %rbx
427 leaq (%r15,%r11,2), %r11 #shld \$1, %r10, %r11
428
429 mulq %rax
430 addq %rax, %r9
431 adcq %rdx, %r10
432 adcq \$0, %r11
433
434 movq %r9, 80(%rsp)
435 movq %r10, 88(%rsp)
436
437 #seventh iteration
438 movq 48($inp), %r13
439 movq 56($inp), %rax
440 mulq %r13
441 addq %rax, %r12
442 movq %r13, %rax
443 movq %rdx, %r13
444 adcq \$0, %r13
445
446 xorq %r14, %r14
447 shlq \$1, %rbx
448 adcq %r12, %r12 #shld \$1, %rbx, %r12
449 adcq %r13, %r13 #shld \$1, %r12, %r13
450 adcq %r14, %r14 #shld \$1, %r13, %r14
451
452 mulq %rax
453 addq %rax, %r11
454 adcq %rdx, %r12
455 adcq \$0, %r13
456
457 movq %r11, 96(%rsp)
458 movq %r12, 104(%rsp)
459
460 #eighth iteration
461 movq 56($inp), %rax
462 mulq %rax
463 addq %rax, %r13
464 adcq \$0, %rdx
465
466 addq %rdx, %r14
467
468 movq %r13, 112(%rsp)
469 movq %r14, 120(%rsp)
470
471 movq (%rsp), %r8
472 movq 8(%rsp), %r9
473 movq 16(%rsp), %r10
474 movq 24(%rsp), %r11
475 movq 32(%rsp), %r12
476 movq 40(%rsp), %r13
477 movq 48(%rsp), %r14
478 movq 56(%rsp), %r15
479
480 call __rsaz_512_reduce
481
482 addq 64(%rsp), %r8
483 adcq 72(%rsp), %r9
484 adcq 80(%rsp), %r10
485 adcq 88(%rsp), %r11
486 adcq 96(%rsp), %r12
487 adcq 104(%rsp), %r13
488 adcq 112(%rsp), %r14
489 adcq 120(%rsp), %r15
490 sbbq %rcx, %rcx
491
492 call __rsaz_512_subtract
493
494 movq %r8, %rdx
495 movq %r9, %rax
496 movl 128+8(%rsp), $times
497 movq $out, $inp
498
499 decl $times
500 jnz .Loop_sqr
501 ___
502 if ($addx) {
503 $code.=<<___;
504 jmp .Lsqr_tail
505
506 .align 32
507 .Loop_sqrx:
508 movl $times,128+8(%rsp)
509 movq $out, %xmm0 # off-load
510 movq %rbp, %xmm1 # off-load
511 #first iteration
512 mulx %rax, %r8, %r9
513
514 mulx 16($inp), %rcx, %r10
515 xor %rbp, %rbp # cf=0, of=0
516
517 mulx 24($inp), %rax, %r11
518 adcx %rcx, %r9
519
520 mulx 32($inp), %rcx, %r12
521 adcx %rax, %r10
522
523 mulx 40($inp), %rax, %r13
524 adcx %rcx, %r11
525
526 .byte 0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($inp), %rcx, %r14
527 adcx %rax, %r12
528 adcx %rcx, %r13
529
530 .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r15
531 adcx %rax, %r14
532 adcx %rbp, %r15 # %rbp is 0
533
534 mov %r9, %rcx
535 shld \$1, %r8, %r9
536 shl \$1, %r8
537
538 xor %ebp, %ebp
539 mulx %rdx, %rax, %rdx
540 adcx %rdx, %r8
541 mov 8($inp), %rdx
542 adcx %rbp, %r9
543
544 mov %rax, (%rsp)
545 mov %r8, 8(%rsp)
546
547 #second iteration
548 mulx 16($inp), %rax, %rbx
549 adox %rax, %r10
550 adcx %rbx, %r11
551
552 .byte 0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r8
553 adox $out, %r11
554 adcx %r8, %r12
555
556 mulx 32($inp), %rax, %rbx
557 adox %rax, %r12
558 adcx %rbx, %r13
559
560 mulx 40($inp), $out, %r8
561 adox $out, %r13
562 adcx %r8, %r14
563
564 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
565 adox %rax, %r14
566 adcx %rbx, %r15
567
568 .byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r8
569 adox $out, %r15
570 adcx %rbp, %r8
571 adox %rbp, %r8
572
573 mov %r11, %rbx
574 shld \$1, %r10, %r11
575 shld \$1, %rcx, %r10
576
577 xor %ebp,%ebp
578 mulx %rdx, %rax, %rcx
579 mov 16($inp), %rdx
580 adcx %rax, %r9
581 adcx %rcx, %r10
582 adcx %rbp, %r11
583
584 mov %r9, 16(%rsp)
585 .byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 # mov %r10, 24(%rsp)
586
587 #third iteration
588 .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r9
589 adox $out, %r12
590 adcx %r9, %r13
591
592 mulx 32($inp), %rax, %rcx
593 adox %rax, %r13
594 adcx %rcx, %r14
595
596 mulx 40($inp), $out, %r9
597 adox $out, %r14
598 adcx %r9, %r15
599
600 .byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rcx
601 adox %rax, %r15
602 adcx %rcx, %r8
603
604 .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r9
605 adox $out, %r8
606 adcx %rbp, %r9
607 adox %rbp, %r9
608
609 mov %r13, %rcx
610 shld \$1, %r12, %r13
611 shld \$1, %rbx, %r12
612
613 xor %ebp, %ebp
614 mulx %rdx, %rax, %rdx
615 adcx %rax, %r11
616 adcx %rdx, %r12
617 mov 24($inp), %rdx
618 adcx %rbp, %r13
619
620 mov %r11, 32(%rsp)
621 .byte 0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00 # mov %r12, 40(%rsp)
622
623 #fourth iteration
624 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00 # mulx 32($inp), %rax, %rbx
625 adox %rax, %r14
626 adcx %rbx, %r15
627
628 mulx 40($inp), $out, %r10
629 adox $out, %r15
630 adcx %r10, %r8
631
632 mulx 48($inp), %rax, %rbx
633 adox %rax, %r8
634 adcx %rbx, %r9
635
636 mulx 56($inp), $out, %r10
637 adox $out, %r9
638 adcx %rbp, %r10
639 adox %rbp, %r10
640
641 .byte 0x66
642 mov %r15, %rbx
643 shld \$1, %r14, %r15
644 shld \$1, %rcx, %r14
645
646 xor %ebp, %ebp
647 mulx %rdx, %rax, %rdx
648 adcx %rax, %r13
649 adcx %rdx, %r14
650 mov 32($inp), %rdx
651 adcx %rbp, %r15
652
653 mov %r13, 48(%rsp)
654 mov %r14, 56(%rsp)
655
656 #fifth iteration
657 .byte 0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00 # mulx 40($inp), $out, %r11
658 adox $out, %r8
659 adcx %r11, %r9
660
661 mulx 48($inp), %rax, %rcx
662 adox %rax, %r9
663 adcx %rcx, %r10
664
665 mulx 56($inp), $out, %r11
666 adox $out, %r10
667 adcx %rbp, %r11
668 adox %rbp, %r11
669
670 mov %r9, %rcx
671 shld \$1, %r8, %r9
672 shld \$1, %rbx, %r8
673
674 xor %ebp, %ebp
675 mulx %rdx, %rax, %rdx
676 adcx %rax, %r15
677 adcx %rdx, %r8
678 mov 40($inp), %rdx
679 adcx %rbp, %r9
680
681 mov %r15, 64(%rsp)
682 mov %r8, 72(%rsp)
683
684 #sixth iteration
685 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
686 adox %rax, %r10
687 adcx %rbx, %r11
688
689 .byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r12
690 adox $out, %r11
691 adcx %rbp, %r12
692 adox %rbp, %r12
693
694 mov %r11, %rbx
695 shld \$1, %r10, %r11
696 shld \$1, %rcx, %r10
697
698 xor %ebp, %ebp
699 mulx %rdx, %rax, %rdx
700 adcx %rax, %r9
701 adcx %rdx, %r10
702 mov 48($inp), %rdx
703 adcx %rbp, %r11
704
705 mov %r9, 80(%rsp)
706 mov %r10, 88(%rsp)
707
708 #seventh iteration
709 .byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r13
710 adox %rax, %r12
711 adox %rbp, %r13
712
713 xor %r14, %r14
714 shld \$1, %r13, %r14
715 shld \$1, %r12, %r13
716 shld \$1, %rbx, %r12
717
718 xor %ebp, %ebp
719 mulx %rdx, %rax, %rdx
720 adcx %rax, %r11
721 adcx %rdx, %r12
722 mov 56($inp), %rdx
723 adcx %rbp, %r13
724
725 .byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 # mov %r11, 96(%rsp)
726 .byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 # mov %r12, 104(%rsp)
727
728 #eighth iteration
729 mulx %rdx, %rax, %rdx
730 adox %rax, %r13
731 adox %rbp, %rdx
732
733 .byte 0x66
734 add %rdx, %r14
735
736 movq %r13, 112(%rsp)
737 movq %r14, 120(%rsp)
738 movq %xmm0, $out
739 movq %xmm1, %rbp
740
741 movq 128(%rsp), %rdx # pull $n0
742 movq (%rsp), %r8
743 movq 8(%rsp), %r9
744 movq 16(%rsp), %r10
745 movq 24(%rsp), %r11
746 movq 32(%rsp), %r12
747 movq 40(%rsp), %r13
748 movq 48(%rsp), %r14
749 movq 56(%rsp), %r15
750
751 call __rsaz_512_reducex
752
753 addq 64(%rsp), %r8
754 adcq 72(%rsp), %r9
755 adcq 80(%rsp), %r10
756 adcq 88(%rsp), %r11
757 adcq 96(%rsp), %r12
758 adcq 104(%rsp), %r13
759 adcq 112(%rsp), %r14
760 adcq 120(%rsp), %r15
761 sbbq %rcx, %rcx
762
763 call __rsaz_512_subtract
764
765 movq %r8, %rdx
766 movq %r9, %rax
767 movl 128+8(%rsp), $times
768 movq $out, $inp
769
770 decl $times
771 jnz .Loop_sqrx
772
773 .Lsqr_tail:
774 ___
775 }
776 $code.=<<___;
777
778 leaq 128+24+48(%rsp), %rax
779 movq -48(%rax), %r15
780 movq -40(%rax), %r14
781 movq -32(%rax), %r13
782 movq -24(%rax), %r12
783 movq -16(%rax), %rbp
784 movq -8(%rax), %rbx
785 leaq (%rax), %rsp
786 .Lsqr_epilogue:
787 ret
788 .size rsaz_512_sqr,.-rsaz_512_sqr
789 ___
790 }
791 {
792 my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
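# rsaz_512_mul: Montgomery product, out = ap*bp*2^-512 mod mod (the result
# may be only partially reduced).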
793 $code.=<<___;
794 .globl rsaz_512_mul
795 .type rsaz_512_mul,\@function,5
796 .align 32
797 rsaz_512_mul:
798 push %rbx
799 push %rbp
800 push %r12
801 push %r13
802 push %r14
803 push %r15
804
805 subq \$128+24, %rsp
806 .Lmul_body:
807 movq $out, %xmm0 # off-load arguments
808 movq $mod, %xmm1
809 movq $n0, 128(%rsp)
810 ___
811 $code.=<<___ if ($addx);
812 movl \$0x80100,%r11d
813 andl OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	\$0x80100,%r11d		# check for MULX and ADCX/ADOX
815 je .Lmulx
816 ___
817 $code.=<<___;
818 movq ($bp), %rbx # pass b[0]
819 movq $bp, %rbp # pass argument
820 call __rsaz_512_mul
821
822 movq %xmm0, $out
823 movq %xmm1, %rbp
824
825 movq (%rsp), %r8
826 movq 8(%rsp), %r9
827 movq 16(%rsp), %r10
828 movq 24(%rsp), %r11
829 movq 32(%rsp), %r12
830 movq 40(%rsp), %r13
831 movq 48(%rsp), %r14
832 movq 56(%rsp), %r15
833
834 call __rsaz_512_reduce
835 ___
836 $code.=<<___ if ($addx);
837 jmp .Lmul_tail
838
839 .align 32
840 .Lmulx:
841 movq $bp, %rbp # pass argument
842 movq ($bp), %rdx # pass b[0]
843 call __rsaz_512_mulx
844
845 movq %xmm0, $out
846 movq %xmm1, %rbp
847
848 movq 128(%rsp), %rdx # pull $n0
849 movq (%rsp), %r8
850 movq 8(%rsp), %r9
851 movq 16(%rsp), %r10
852 movq 24(%rsp), %r11
853 movq 32(%rsp), %r12
854 movq 40(%rsp), %r13
855 movq 48(%rsp), %r14
856 movq 56(%rsp), %r15
857
858 call __rsaz_512_reducex
859 .Lmul_tail:
860 ___
861 $code.=<<___;
862 addq 64(%rsp), %r8
863 adcq 72(%rsp), %r9
864 adcq 80(%rsp), %r10
865 adcq 88(%rsp), %r11
866 adcq 96(%rsp), %r12
867 adcq 104(%rsp), %r13
868 adcq 112(%rsp), %r14
869 adcq 120(%rsp), %r15
870 sbbq %rcx, %rcx
871
872 call __rsaz_512_subtract
873
874 leaq 128+24+48(%rsp), %rax
875 movq -48(%rax), %r15
876 movq -40(%rax), %r14
877 movq -32(%rax), %r13
878 movq -24(%rax), %r12
879 movq -16(%rax), %rbp
880 movq -8(%rax), %rbx
881 leaq (%rax), %rsp
882 .Lmul_epilogue:
883 ret
884 .size rsaz_512_mul,.-rsaz_512_mul
885 ___
886 }
887 {
888 my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
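# rsaz_512_mul_gather4 is rsaz_512_mul with the b operand gathered limb by
# limb from slot $pwr of the 16-entry interleaved table at $bp (two 32-bit
# halves 64 bytes apart, 128-byte stride per limb; see rsaz_512_scatter4
# below for the layout), interleaved with the multiplication itself.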
889 $code.=<<___;
890 .globl rsaz_512_mul_gather4
891 .type rsaz_512_mul_gather4,\@function,6
892 .align 32
893 rsaz_512_mul_gather4:
894 push %rbx
895 push %rbp
896 push %r12
897 push %r13
898 push %r14
899 push %r15
900
901 subq \$128+24, %rsp
902 .Lmul_gather4_body:
903 ___
904 $code.=<<___ if ($addx);
905 movl \$0x80100,%r11d
906 andl OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	\$0x80100,%r11d		# check for MULX and ADCX/ADOX
908 je .Lmulx_gather
909 ___
910 $code.=<<___;
911 movl 64($bp,$pwr,4), %eax
912 movq $out, %xmm0 # off-load arguments
913 movl ($bp,$pwr,4), %ebx
914 movq $mod, %xmm1
915 movq $n0, 128(%rsp)
916
917 shlq \$32, %rax
918 or %rax, %rbx
919 movq ($ap), %rax
920 movq 8($ap), %rcx
921 leaq 128($bp,$pwr,4), %rbp
922 mulq %rbx # 0 iteration
923 movq %rax, (%rsp)
924 movq %rcx, %rax
925 movq %rdx, %r8
926
927 mulq %rbx
928 movd (%rbp), %xmm4
929 addq %rax, %r8
930 movq 16($ap), %rax
931 movq %rdx, %r9
932 adcq \$0, %r9
933
934 mulq %rbx
935 movd 64(%rbp), %xmm5
936 addq %rax, %r9
937 movq 24($ap), %rax
938 movq %rdx, %r10
939 adcq \$0, %r10
940
941 mulq %rbx
942 pslldq \$4, %xmm5
943 addq %rax, %r10
944 movq 32($ap), %rax
945 movq %rdx, %r11
946 adcq \$0, %r11
947
948 mulq %rbx
949 por %xmm5, %xmm4
950 addq %rax, %r11
951 movq 40($ap), %rax
952 movq %rdx, %r12
953 adcq \$0, %r12
954
955 mulq %rbx
956 addq %rax, %r12
957 movq 48($ap), %rax
958 movq %rdx, %r13
959 adcq \$0, %r13
960
961 mulq %rbx
962 leaq 128(%rbp), %rbp
963 addq %rax, %r13
964 movq 56($ap), %rax
965 movq %rdx, %r14
966 adcq \$0, %r14
967
968 mulq %rbx
969 movq %xmm4, %rbx
970 addq %rax, %r14
971 movq ($ap), %rax
972 movq %rdx, %r15
973 adcq \$0, %r15
974
975 leaq 8(%rsp), %rdi
976 movl \$7, %ecx
977 jmp .Loop_mul_gather
978
979 .align 32
980 .Loop_mul_gather:
981 mulq %rbx
982 addq %rax, %r8
983 movq 8($ap), %rax
984 movq %r8, (%rdi)
985 movq %rdx, %r8
986 adcq \$0, %r8
987
988 mulq %rbx
989 movd (%rbp), %xmm4
990 addq %rax, %r9
991 movq 16($ap), %rax
992 adcq \$0, %rdx
993 addq %r9, %r8
994 movq %rdx, %r9
995 adcq \$0, %r9
996
997 mulq %rbx
998 movd 64(%rbp), %xmm5
999 addq %rax, %r10
1000 movq 24($ap), %rax
1001 adcq \$0, %rdx
1002 addq %r10, %r9
1003 movq %rdx, %r10
1004 adcq \$0, %r10
1005
1006 mulq %rbx
1007 pslldq \$4, %xmm5
1008 addq %rax, %r11
1009 movq 32($ap), %rax
1010 adcq \$0, %rdx
1011 addq %r11, %r10
1012 movq %rdx, %r11
1013 adcq \$0, %r11
1014
1015 mulq %rbx
1016 por %xmm5, %xmm4
1017 addq %rax, %r12
1018 movq 40($ap), %rax
1019 adcq \$0, %rdx
1020 addq %r12, %r11
1021 movq %rdx, %r12
1022 adcq \$0, %r12
1023
1024 mulq %rbx
1025 addq %rax, %r13
1026 movq 48($ap), %rax
1027 adcq \$0, %rdx
1028 addq %r13, %r12
1029 movq %rdx, %r13
1030 adcq \$0, %r13
1031
1032 mulq %rbx
1033 addq %rax, %r14
1034 movq 56($ap), %rax
1035 adcq \$0, %rdx
1036 addq %r14, %r13
1037 movq %rdx, %r14
1038 adcq \$0, %r14
1039
1040 mulq %rbx
1041 movq %xmm4, %rbx
1042 addq %rax, %r15
1043 movq ($ap), %rax
1044 adcq \$0, %rdx
1045 addq %r15, %r14
1046 movq %rdx, %r15
1047 adcq \$0, %r15
1048
1049 leaq 128(%rbp), %rbp
1050 leaq 8(%rdi), %rdi
1051
1052 decl %ecx
1053 jnz .Loop_mul_gather
1054
1055 movq %r8, (%rdi)
1056 movq %r9, 8(%rdi)
1057 movq %r10, 16(%rdi)
1058 movq %r11, 24(%rdi)
1059 movq %r12, 32(%rdi)
1060 movq %r13, 40(%rdi)
1061 movq %r14, 48(%rdi)
1062 movq %r15, 56(%rdi)
1063
1064 movq %xmm0, $out
1065 movq %xmm1, %rbp
1066
1067 movq (%rsp), %r8
1068 movq 8(%rsp), %r9
1069 movq 16(%rsp), %r10
1070 movq 24(%rsp), %r11
1071 movq 32(%rsp), %r12
1072 movq 40(%rsp), %r13
1073 movq 48(%rsp), %r14
1074 movq 56(%rsp), %r15
1075
1076 call __rsaz_512_reduce
1077 ___
1078 $code.=<<___ if ($addx);
1079 jmp .Lmul_gather_tail
1080
1081 .align 32
1082 .Lmulx_gather:
1083 mov 64($bp,$pwr,4), %eax
1084 movq $out, %xmm0 # off-load arguments
1085 lea 128($bp,$pwr,4), %rbp
1086 mov ($bp,$pwr,4), %edx
1087 movq $mod, %xmm1
1088 mov $n0, 128(%rsp)
1089
1090 shl \$32, %rax
1091 or %rax, %rdx
1092 mulx ($ap), %rbx, %r8 # 0 iteration
1093 mov %rbx, (%rsp)
1094 xor %edi, %edi # cf=0, of=0
1095
1096 mulx 8($ap), %rax, %r9
1097 movd (%rbp), %xmm4
1098
1099 mulx 16($ap), %rbx, %r10
1100 movd 64(%rbp), %xmm5
1101 adcx %rax, %r8
1102
1103 mulx 24($ap), %rax, %r11
1104 pslldq \$4, %xmm5
1105 adcx %rbx, %r9
1106
1107 mulx 32($ap), %rbx, %r12
1108 por %xmm5, %xmm4
1109 adcx %rax, %r10
1110
1111 mulx 40($ap), %rax, %r13
1112 adcx %rbx, %r11
1113
1114 mulx 48($ap), %rbx, %r14
1115 lea 128(%rbp), %rbp
1116 adcx %rax, %r12
1117
1118 mulx 56($ap), %rax, %r15
1119 movq %xmm4, %rdx
1120 adcx %rbx, %r13
1121 adcx %rax, %r14
1122 mov %r8, %rbx
1123 adcx %rdi, %r15 # %rdi is 0
1124
1125 mov \$-7, %rcx
1126 jmp .Loop_mulx_gather
1127
1128 .align 32
1129 .Loop_mulx_gather:
1130 mulx ($ap), %rax, %r8
1131 adcx %rax, %rbx
1132 adox %r9, %r8
1133
1134 mulx 8($ap), %rax, %r9
1135 .byte 0x66,0x0f,0x6e,0xa5,0x00,0x00,0x00,0x00 # movd (%rbp), %xmm4
1136 adcx %rax, %r8
1137 adox %r10, %r9
1138
1139 mulx 16($ap), %rax, %r10
1140 movd 64(%rbp), %xmm5
1141 lea 128(%rbp), %rbp
1142 adcx %rax, %r9
1143 adox %r11, %r10
1144
1145 .byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 # mulx 24($ap), %rax, %r11
1146 pslldq \$4, %xmm5
1147 por %xmm5, %xmm4
1148 adcx %rax, %r10
1149 adox %r12, %r11
1150
1151 mulx 32($ap), %rax, %r12
1152 adcx %rax, %r11
1153 adox %r13, %r12
1154
1155 mulx 40($ap), %rax, %r13
1156 adcx %rax, %r12
1157 adox %r14, %r13
1158
1159 .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
1160 adcx %rax, %r13
1161 adox %r15, %r14
1162
1163 mulx 56($ap), %rax, %r15
1164 movq %xmm4, %rdx
1165 mov %rbx, 64(%rsp,%rcx,8)
1166 adcx %rax, %r14
1167 adox %rdi, %r15
1168 mov %r8, %rbx
1169 adcx %rdi, %r15 # cf=0
1170
1171 inc %rcx # of=0
1172 jnz .Loop_mulx_gather
1173
1174 mov %r8, 64(%rsp)
1175 mov %r9, 64+8(%rsp)
1176 mov %r10, 64+16(%rsp)
1177 mov %r11, 64+24(%rsp)
1178 mov %r12, 64+32(%rsp)
1179 mov %r13, 64+40(%rsp)
1180 mov %r14, 64+48(%rsp)
1181 mov %r15, 64+56(%rsp)
1182
1183 movq %xmm0, $out
1184 movq %xmm1, %rbp
1185
1186 mov 128(%rsp), %rdx # pull $n0
1187 mov (%rsp), %r8
1188 mov 8(%rsp), %r9
1189 mov 16(%rsp), %r10
1190 mov 24(%rsp), %r11
1191 mov 32(%rsp), %r12
1192 mov 40(%rsp), %r13
1193 mov 48(%rsp), %r14
1194 mov 56(%rsp), %r15
1195
1196 call __rsaz_512_reducex
1197
1198 .Lmul_gather_tail:
1199 ___
1200 $code.=<<___;
1201 addq 64(%rsp), %r8
1202 adcq 72(%rsp), %r9
1203 adcq 80(%rsp), %r10
1204 adcq 88(%rsp), %r11
1205 adcq 96(%rsp), %r12
1206 adcq 104(%rsp), %r13
1207 adcq 112(%rsp), %r14
1208 adcq 120(%rsp), %r15
1209 sbbq %rcx, %rcx
1210
1211 call __rsaz_512_subtract
1212
1213 leaq 128+24+48(%rsp), %rax
1214 movq -48(%rax), %r15
1215 movq -40(%rax), %r14
1216 movq -32(%rax), %r13
1217 movq -24(%rax), %r12
1218 movq -16(%rax), %rbp
1219 movq -8(%rax), %rbx
1220 leaq (%rax), %rsp
1221 .Lmul_gather4_epilogue:
1222 ret
1223 .size rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
1224 ___
1225 }
1226 {
1227 my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
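# rsaz_512_mul_scatter4 computes the Montgomery product of the number at
# $out and $ap, stores it back at $out and also scatters it into slot $pwr
# of the table at $tbl (each 64-bit limb split into two 32-bit halves
# stored 64 bytes apart).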
1228 $code.=<<___;
1229 .globl rsaz_512_mul_scatter4
1230 .type rsaz_512_mul_scatter4,\@function,6
1231 .align 32
1232 rsaz_512_mul_scatter4:
1233 push %rbx
1234 push %rbp
1235 push %r12
1236 push %r13
1237 push %r14
1238 push %r15
1239
1240 subq \$128+24, %rsp
1241 .Lmul_scatter4_body:
1242 leaq ($tbl,$pwr,4), $tbl
1243 movq $out, %xmm0 # off-load arguments
1244 movq $mod, %xmm1
1245 movq $tbl, %xmm2
1246 movq $n0, 128(%rsp)
1247
1248 movq $out, %rbp
1249 ___
1250 $code.=<<___ if ($addx);
1251 movl \$0x80100,%r11d
1252 andl OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	\$0x80100,%r11d		# check for MULX and ADCX/ADOX
1254 je .Lmulx_scatter
1255 ___
1256 $code.=<<___;
1257 movq ($out),%rbx # pass b[0]
1258 call __rsaz_512_mul
1259
1260 movq %xmm0, $out
1261 movq %xmm1, %rbp
1262
1263 movq (%rsp), %r8
1264 movq 8(%rsp), %r9
1265 movq 16(%rsp), %r10
1266 movq 24(%rsp), %r11
1267 movq 32(%rsp), %r12
1268 movq 40(%rsp), %r13
1269 movq 48(%rsp), %r14
1270 movq 56(%rsp), %r15
1271
1272 call __rsaz_512_reduce
1273 ___
1274 $code.=<<___ if ($addx);
1275 jmp .Lmul_scatter_tail
1276
1277 .align 32
1278 .Lmulx_scatter:
1279 movq ($out), %rdx # pass b[0]
1280 call __rsaz_512_mulx
1281
1282 movq %xmm0, $out
1283 movq %xmm1, %rbp
1284
1285 movq 128(%rsp), %rdx # pull $n0
1286 movq (%rsp), %r8
1287 movq 8(%rsp), %r9
1288 movq 16(%rsp), %r10
1289 movq 24(%rsp), %r11
1290 movq 32(%rsp), %r12
1291 movq 40(%rsp), %r13
1292 movq 48(%rsp), %r14
1293 movq 56(%rsp), %r15
1294
1295 call __rsaz_512_reducex
1296
1297 .Lmul_scatter_tail:
1298 ___
1299 $code.=<<___;
1300 addq 64(%rsp), %r8
1301 adcq 72(%rsp), %r9
1302 adcq 80(%rsp), %r10
1303 adcq 88(%rsp), %r11
1304 adcq 96(%rsp), %r12
1305 adcq 104(%rsp), %r13
1306 adcq 112(%rsp), %r14
1307 adcq 120(%rsp), %r15
1308 movq %xmm2, $inp
1309 sbbq %rcx, %rcx
1310
1311 call __rsaz_512_subtract
1312
1313 movl %r8d, 64*0($inp) # scatter
1314 shrq \$32, %r8
1315 movl %r9d, 64*2($inp)
1316 shrq \$32, %r9
1317 movl %r10d, 64*4($inp)
1318 shrq \$32, %r10
1319 movl %r11d, 64*6($inp)
1320 shrq \$32, %r11
1321 movl %r12d, 64*8($inp)
1322 shrq \$32, %r12
1323 movl %r13d, 64*10($inp)
1324 shrq \$32, %r13
1325 movl %r14d, 64*12($inp)
1326 shrq \$32, %r14
1327 movl %r15d, 64*14($inp)
1328 shrq \$32, %r15
1329 movl %r8d, 64*1($inp)
1330 movl %r9d, 64*3($inp)
1331 movl %r10d, 64*5($inp)
1332 movl %r11d, 64*7($inp)
1333 movl %r12d, 64*9($inp)
1334 movl %r13d, 64*11($inp)
1335 movl %r14d, 64*13($inp)
1336 movl %r15d, 64*15($inp)
1337
1338 leaq 128+24+48(%rsp), %rax
1339 movq -48(%rax), %r15
1340 movq -40(%rax), %r14
1341 movq -32(%rax), %r13
1342 movq -24(%rax), %r12
1343 movq -16(%rax), %rbp
1344 movq -8(%rax), %rbx
1345 leaq (%rax), %rsp
1346 .Lmul_scatter4_epilogue:
1347 ret
1348 .size rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
1349 ___
1350 }
1351 {
1352 my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx");
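# rsaz_512_mul_by_one converts $inp out of Montgomery form: it treats $inp
# as the low half of a product whose high half is zero and runs the
# reduction, i.e. out = inp * 2^-512 mod mod.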
1353 $code.=<<___;
1354 .globl rsaz_512_mul_by_one
1355 .type rsaz_512_mul_by_one,\@function,4
1356 .align 32
1357 rsaz_512_mul_by_one:
1358 push %rbx
1359 push %rbp
1360 push %r12
1361 push %r13
1362 push %r14
1363 push %r15
1364
1365 subq \$128+24, %rsp
1366 .Lmul_by_one_body:
1367 ___
1368 $code.=<<___ if ($addx);
1369 movl OPENSSL_ia32cap_P+8(%rip),%eax
1370 ___
1371 $code.=<<___;
1372 movq $mod, %rbp # reassign argument
1373 movq $n0, 128(%rsp)
1374
1375 movq ($inp), %r8
1376 pxor %xmm0, %xmm0
1377 movq 8($inp), %r9
1378 movq 16($inp), %r10
1379 movq 24($inp), %r11
1380 movq 32($inp), %r12
1381 movq 40($inp), %r13
1382 movq 48($inp), %r14
1383 movq 56($inp), %r15
1384
1385 movdqa %xmm0, (%rsp)
1386 movdqa %xmm0, 16(%rsp)
1387 movdqa %xmm0, 32(%rsp)
1388 movdqa %xmm0, 48(%rsp)
1389 movdqa %xmm0, 64(%rsp)
1390 movdqa %xmm0, 80(%rsp)
1391 movdqa %xmm0, 96(%rsp)
1392 ___
1393 $code.=<<___ if ($addx);
1394 andl \$0x80100,%eax
	cmpl	\$0x80100,%eax		# check for MULX and ADCX/ADOX
1396 je .Lby_one_callx
1397 ___
1398 $code.=<<___;
1399 call __rsaz_512_reduce
1400 ___
1401 $code.=<<___ if ($addx);
1402 jmp .Lby_one_tail
1403 .align 32
1404 .Lby_one_callx:
1405 movq 128(%rsp), %rdx # pull $n0
1406 call __rsaz_512_reducex
1407 .Lby_one_tail:
1408 ___
1409 $code.=<<___;
1410 movq %r8, ($out)
1411 movq %r9, 8($out)
1412 movq %r10, 16($out)
1413 movq %r11, 24($out)
1414 movq %r12, 32($out)
1415 movq %r13, 40($out)
1416 movq %r14, 48($out)
1417 movq %r15, 56($out)
1418
1419 leaq 128+24+48(%rsp), %rax
1420 movq -48(%rax), %r15
1421 movq -40(%rax), %r14
1422 movq -32(%rax), %r13
1423 movq -24(%rax), %r12
1424 movq -16(%rax), %rbp
1425 movq -8(%rax), %rbx
1426 leaq (%rax), %rsp
1427 .Lmul_by_one_epilogue:
1428 ret
1429 .size rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
1430 ___
1431 }
1432 { # __rsaz_512_reduce
1433 #
1434 # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
1435 # output: %r8-%r15
1436 # clobbers: everything except %rbp and %rdi
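#
# In big-number terms this is one Montgomery reduction pass over the low
# half of the product: for each of the 8 limbs, a multiple of the modulus
# is added so that the lowest limb becomes zero and the window is shifted
# down by one limb.  (n0 = -mod^-1 mod 2^64 sits at 128(%rsp) in the
# caller's frame and is read at 128+8(%rsp) here, past the return address.)
# A rough Math::BigInt sketch, illustrative only, not used by this
# generator, with a made-up helper name:
sub _ref_reduce_512 {
	require Math::BigInt;
	my ($t, $n, $n0) = @_;			# $t = low 512 bits of the product
	my $w = Math::BigInt->new(1)->blsft(64);	# 2^64
	for (1..8) {
		my $m = (($t % $w) * $n0) % $w;	# m = t[0]*n0 mod 2^64
		$t = ($t + $m * $n)->brsft(64);	# t += m*n, drop the zeroed limb
	}
	return $t;	# callers then add the high 512 bits of the product
}			# and conditionally subtract the modulus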
1437 $code.=<<___;
1438 .type __rsaz_512_reduce,\@abi-omnipotent
1439 .align 32
1440 __rsaz_512_reduce:
1441 movq %r8, %rbx
1442 imulq 128+8(%rsp), %rbx
1443 movq 0(%rbp), %rax
1444 movl \$8, %ecx
1445 jmp .Lreduction_loop
1446
1447 .align 32
1448 .Lreduction_loop:
1449 mulq %rbx
1450 movq 8(%rbp), %rax
1451 negq %r8
1452 movq %rdx, %r8
1453 adcq \$0, %r8
1454
1455 mulq %rbx
1456 addq %rax, %r9
1457 movq 16(%rbp), %rax
1458 adcq \$0, %rdx
1459 addq %r9, %r8
1460 movq %rdx, %r9
1461 adcq \$0, %r9
1462
1463 mulq %rbx
1464 addq %rax, %r10
1465 movq 24(%rbp), %rax
1466 adcq \$0, %rdx
1467 addq %r10, %r9
1468 movq %rdx, %r10
1469 adcq \$0, %r10
1470
1471 mulq %rbx
1472 addq %rax, %r11
1473 movq 32(%rbp), %rax
1474 adcq \$0, %rdx
1475 addq %r11, %r10
1476 movq 128+8(%rsp), %rsi
1477 #movq %rdx, %r11
1478 #adcq \$0, %r11
1479 adcq \$0, %rdx
1480 movq %rdx, %r11
1481
1482 mulq %rbx
1483 addq %rax, %r12
1484 movq 40(%rbp), %rax
1485 adcq \$0, %rdx
1486 imulq %r8, %rsi
1487 addq %r12, %r11
1488 movq %rdx, %r12
1489 adcq \$0, %r12
1490
1491 mulq %rbx
1492 addq %rax, %r13
1493 movq 48(%rbp), %rax
1494 adcq \$0, %rdx
1495 addq %r13, %r12
1496 movq %rdx, %r13
1497 adcq \$0, %r13
1498
1499 mulq %rbx
1500 addq %rax, %r14
1501 movq 56(%rbp), %rax
1502 adcq \$0, %rdx
1503 addq %r14, %r13
1504 movq %rdx, %r14
1505 adcq \$0, %r14
1506
1507 mulq %rbx
1508 movq %rsi, %rbx
1509 addq %rax, %r15
1510 movq 0(%rbp), %rax
1511 adcq \$0, %rdx
1512 addq %r15, %r14
1513 movq %rdx, %r15
1514 adcq \$0, %r15
1515
1516 decl %ecx
1517 jne .Lreduction_loop
1518
1519 ret
1520 .size __rsaz_512_reduce,.-__rsaz_512_reduce
1521 ___
1522 }
1523 if ($addx) {
1524 # __rsaz_512_reducex
1525 #
1526 # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
1527 # output: %r8-%r15
1528 # clobbers: everything except %rbp and %rdi
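# Same reduction as __rsaz_512_reduce, except that the caller pre-loads n0
# into %rdx and the inner products use mulx with the adcx/adox dual carry
# chains.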
1529 $code.=<<___;
1530 .type __rsaz_512_reducex,\@abi-omnipotent
1531 .align 32
1532 __rsaz_512_reducex:
1533 #movq 128+8(%rsp), %rdx # pull $n0
1534 imulq %r8, %rdx
1535 xorq %rsi, %rsi # cf=0,of=0
1536 movl \$8, %ecx
1537 jmp .Lreduction_loopx
1538
1539 .align 32
1540 .Lreduction_loopx:
1541 mov %r8, %rbx
1542 mulx 0(%rbp), %rax, %r8
1543 adcx %rbx, %rax
1544 adox %r9, %r8
1545
1546 mulx 8(%rbp), %rax, %r9
1547 adcx %rax, %r8
1548 adox %r10, %r9
1549
1550 mulx 16(%rbp), %rbx, %r10
1551 adcx %rbx, %r9
1552 adox %r11, %r10
1553
1554 mulx 24(%rbp), %rbx, %r11
1555 adcx %rbx, %r10
1556 adox %r12, %r11
1557
1558 .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 32(%rbp), %rbx, %r12
1559 mov %rdx, %rax
1560 mov %r8, %rdx
1561 adcx %rbx, %r11
1562 adox %r13, %r12
1563
1564 mulx 128+8(%rsp), %rbx, %rdx
1565 mov %rax, %rdx
1566
1567 mulx 40(%rbp), %rax, %r13
1568 adcx %rax, %r12
1569 adox %r14, %r13
1570
1571 .byte 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00 # mulx 48(%rbp), %rax, %r14
1572 adcx %rax, %r13
1573 adox %r15, %r14
1574
1575 mulx 56(%rbp), %rax, %r15
1576 mov %rbx, %rdx
1577 adcx %rax, %r14
1578 adox %rsi, %r15 # %rsi is 0
1579 adcx %rsi, %r15 # cf=0
1580
1581 decl %ecx # of=0
1582 jne .Lreduction_loopx
1583
1584 ret
1585 .size __rsaz_512_reducex,.-__rsaz_512_reducex
1586 ___
1587 }
1588 { # __rsaz_512_subtract
1589 # input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask
1590 # output:
1591 # clobbers: everything but %rdi, %rsi and %rbp
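# %rcx is a 0/all-ones mask (derived via sbbq from the carry of the
# caller's preceding addition).  The routine stores the 8-limb result and
# then adds (-$mod) & mask to it; since the modulus is odd its two's
# complement is (-limb0, ~limb1, ..., ~limb7), which is why only the first
# limb is negated and the rest merely complemented before masking.  Net
# effect: the modulus is subtracted iff the mask is set.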
1592 $code.=<<___;
1593 .type __rsaz_512_subtract,\@abi-omnipotent
1594 .align 32
1595 __rsaz_512_subtract:
1596 movq %r8, ($out)
1597 movq %r9, 8($out)
1598 movq %r10, 16($out)
1599 movq %r11, 24($out)
1600 movq %r12, 32($out)
1601 movq %r13, 40($out)
1602 movq %r14, 48($out)
1603 movq %r15, 56($out)
1604
1605 movq 0($mod), %r8
1606 movq 8($mod), %r9
1607 negq %r8
1608 notq %r9
1609 andq %rcx, %r8
1610 movq 16($mod), %r10
1611 andq %rcx, %r9
1612 notq %r10
1613 movq 24($mod), %r11
1614 andq %rcx, %r10
1615 notq %r11
1616 movq 32($mod), %r12
1617 andq %rcx, %r11
1618 notq %r12
1619 movq 40($mod), %r13
1620 andq %rcx, %r12
1621 notq %r13
1622 movq 48($mod), %r14
1623 andq %rcx, %r13
1624 notq %r14
1625 movq 56($mod), %r15
1626 andq %rcx, %r14
1627 notq %r15
1628 andq %rcx, %r15
1629
1630 addq ($out), %r8
1631 adcq 8($out), %r9
1632 adcq 16($out), %r10
1633 adcq 24($out), %r11
1634 adcq 32($out), %r12
1635 adcq 40($out), %r13
1636 adcq 48($out), %r14
1637 adcq 56($out), %r15
1638
1639 movq %r8, ($out)
1640 movq %r9, 8($out)
1641 movq %r10, 16($out)
1642 movq %r11, 24($out)
1643 movq %r12, 32($out)
1644 movq %r13, 40($out)
1645 movq %r14, 48($out)
1646 movq %r15, 56($out)
1647
1648 ret
1649 .size __rsaz_512_subtract,.-__rsaz_512_subtract
1650 ___
1651 }
1652 { # __rsaz_512_mul
1653 #
1654 # input: %rsi - ap, %rbp - bp
# output:
1656 # clobbers: everything
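# Schoolbook 8x8-limb multiplication: the pass below multiplies a[] by b[0]
# (pre-loaded into %rbx by the caller), then each .Loop_mul pass multiplies
# by the next b[] limb and accumulates, retiring one low limb of the
# 16-limb product per pass through %rdi into the caller's scratch area.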
1657 my ($ap,$bp) = ("%rsi","%rbp");
1658 $code.=<<___;
1659 .type __rsaz_512_mul,\@abi-omnipotent
1660 .align 32
1661 __rsaz_512_mul:
1662 leaq 8(%rsp), %rdi
1663
1664 movq ($ap), %rax
1665 mulq %rbx
1666 movq %rax, (%rdi)
1667 movq 8($ap), %rax
1668 movq %rdx, %r8
1669
1670 mulq %rbx
1671 addq %rax, %r8
1672 movq 16($ap), %rax
1673 movq %rdx, %r9
1674 adcq \$0, %r9
1675
1676 mulq %rbx
1677 addq %rax, %r9
1678 movq 24($ap), %rax
1679 movq %rdx, %r10
1680 adcq \$0, %r10
1681
1682 mulq %rbx
1683 addq %rax, %r10
1684 movq 32($ap), %rax
1685 movq %rdx, %r11
1686 adcq \$0, %r11
1687
1688 mulq %rbx
1689 addq %rax, %r11
1690 movq 40($ap), %rax
1691 movq %rdx, %r12
1692 adcq \$0, %r12
1693
1694 mulq %rbx
1695 addq %rax, %r12
1696 movq 48($ap), %rax
1697 movq %rdx, %r13
1698 adcq \$0, %r13
1699
1700 mulq %rbx
1701 addq %rax, %r13
1702 movq 56($ap), %rax
1703 movq %rdx, %r14
1704 adcq \$0, %r14
1705
1706 mulq %rbx
1707 addq %rax, %r14
1708 movq ($ap), %rax
1709 movq %rdx, %r15
1710 adcq \$0, %r15
1711
1712 leaq 8($bp), $bp
1713 leaq 8(%rdi), %rdi
1714
1715 movl \$7, %ecx
1716 jmp .Loop_mul
1717
1718 .align 32
1719 .Loop_mul:
1720 movq ($bp), %rbx
1721 mulq %rbx
1722 addq %rax, %r8
1723 movq 8($ap), %rax
1724 movq %r8, (%rdi)
1725 movq %rdx, %r8
1726 adcq \$0, %r8
1727
1728 mulq %rbx
1729 addq %rax, %r9
1730 movq 16($ap), %rax
1731 adcq \$0, %rdx
1732 addq %r9, %r8
1733 movq %rdx, %r9
1734 adcq \$0, %r9
1735
1736 mulq %rbx
1737 addq %rax, %r10
1738 movq 24($ap), %rax
1739 adcq \$0, %rdx
1740 addq %r10, %r9
1741 movq %rdx, %r10
1742 adcq \$0, %r10
1743
1744 mulq %rbx
1745 addq %rax, %r11
1746 movq 32($ap), %rax
1747 adcq \$0, %rdx
1748 addq %r11, %r10
1749 movq %rdx, %r11
1750 adcq \$0, %r11
1751
1752 mulq %rbx
1753 addq %rax, %r12
1754 movq 40($ap), %rax
1755 adcq \$0, %rdx
1756 addq %r12, %r11
1757 movq %rdx, %r12
1758 adcq \$0, %r12
1759
1760 mulq %rbx
1761 addq %rax, %r13
1762 movq 48($ap), %rax
1763 adcq \$0, %rdx
1764 addq %r13, %r12
1765 movq %rdx, %r13
1766 adcq \$0, %r13
1767
1768 mulq %rbx
1769 addq %rax, %r14
1770 movq 56($ap), %rax
1771 adcq \$0, %rdx
1772 addq %r14, %r13
1773 movq %rdx, %r14
1774 leaq 8($bp), $bp
1775 adcq \$0, %r14
1776
1777 mulq %rbx
1778 addq %rax, %r15
1779 movq ($ap), %rax
1780 adcq \$0, %rdx
1781 addq %r15, %r14
1782 movq %rdx, %r15
1783 adcq \$0, %r15
1784
1785 leaq 8(%rdi), %rdi
1786
1787 decl %ecx
1788 jnz .Loop_mul
1789
1790 movq %r8, (%rdi)
1791 movq %r9, 8(%rdi)
1792 movq %r10, 16(%rdi)
1793 movq %r11, 24(%rdi)
1794 movq %r12, 32(%rdi)
1795 movq %r13, 40(%rdi)
1796 movq %r14, 48(%rdi)
1797 movq %r15, 56(%rdi)
1798
1799 ret
1800 .size __rsaz_512_mul,.-__rsaz_512_mul
1801 ___
1802 }
1803 if ($addx) {
1804 # __rsaz_512_mulx
1805 #
1806 # input: %rsi - ap, %rbp - bp
# output:
1808 # clobbers: everything
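# Same schoolbook multiplication as __rsaz_512_mul, but using MULX (which
# leaves the flags untouched) and two independent carry chains: adcx adds
# through CF only and adox through OF only, so the two accumulations in
# each pass can be interleaved without saving and restoring carries.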
1809 my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi");
1810 $code.=<<___;
1811 .type __rsaz_512_mulx,\@abi-omnipotent
1812 .align 32
1813 __rsaz_512_mulx:
1814 mulx ($ap), %rbx, %r8 # initial %rdx preloaded by caller
1815 xor $zero, $zero # cf=0,of=0
1816
1817 mulx 8($ap), %rax, %r9
1818 movq %rbx, 8(%rsp)
1819
1820 mulx 16($ap), %rbx, %r10
1821 adcx %rax, %r8
1822
1823 mulx 24($ap), %rax, %r11
1824 adcx %rbx, %r9
1825
1826 .byte 0xc4,0x62,0xe3,0xf6,0xa6,0x20,0x00,0x00,0x00 # mulx 32($ap), %rbx, %r12
1827 adcx %rax, %r10
1828
1829 mulx 40($ap), %rax, %r13
1830 adcx %rbx, %r11
1831
1832 mulx 48($ap), %rbx, %r14
1833 adcx %rax, %r12
1834
1835 mulx 56($ap), %rax, %r15
1836 mov 8($bp), %rdx
1837 adcx %rbx, %r13
1838 adcx %rax, %r14
1839 adcx $zero, %r15 # cf=0
1840
1841 mov \$-6, %rcx
1842 jmp .Loop_mulx
1843
1844 .align 32
1845 .Loop_mulx:
1846 movq %r8, %rbx
1847 mulx ($ap), %rax, %r8
1848 adcx %rax, %rbx
1849 adox %r9, %r8
1850
1851 mulx 8($ap), %rax, %r9
1852 adcx %rax, %r8
1853 adox %r10, %r9
1854
1855 mulx 16($ap), %rax, %r10
1856 adcx %rax, %r9
1857 adox %r11, %r10
1858
1859 mulx 24($ap), %rax, %r11
1860 adcx %rax, %r10
1861 adox %r12, %r11
1862
1863 .byte 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00 # mulx 32($ap), %rax, %r12
1864 adcx %rax, %r11
1865 adox %r13, %r12
1866
1867 mulx 40($ap), %rax, %r13
1868 adcx %rax, %r12
1869 adox %r14, %r13
1870
1871 mulx 48($ap), %rax, %r14
1872 adcx %rax, %r13
1873 adox %r15, %r14
1874
1875 mulx 56($ap), %rax, %r15
1876 movq 64($bp,%rcx,8), %rdx
1877 movq %rbx, 8+64-8(%rsp,%rcx,8)
1878 adcx %rax, %r14
1879 adox $zero, %r15
1880 adcx $zero, %r15 # cf=0
1881
1882 inc %rcx # of=0
1883 jnz .Loop_mulx
1884
1885 movq %r8, %rbx
1886 mulx ($ap), %rax, %r8
1887 adcx %rax, %rbx
1888 adox %r9, %r8
1889
1890 .byte 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00 # mulx 8($ap), %rax, %r9
1891 adcx %rax, %r8
1892 adox %r10, %r9
1893
1894 .byte 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00 # mulx 16($ap), %rax, %r10
1895 adcx %rax, %r9
1896 adox %r11, %r10
1897
1898 mulx 24($ap), %rax, %r11
1899 adcx %rax, %r10
1900 adox %r12, %r11
1901
1902 mulx 32($ap), %rax, %r12
1903 adcx %rax, %r11
1904 adox %r13, %r12
1905
1906 mulx 40($ap), %rax, %r13
1907 adcx %rax, %r12
1908 adox %r14, %r13
1909
1910 .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
1911 adcx %rax, %r13
1912 adox %r15, %r14
1913
1914 .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($ap), %rax, %r15
1915 adcx %rax, %r14
1916 adox $zero, %r15
1917 adcx $zero, %r15
1918
1919 mov %rbx, 8+64-8(%rsp)
1920 mov %r8, 8+64(%rsp)
1921 mov %r9, 8+64+8(%rsp)
1922 mov %r10, 8+64+16(%rsp)
1923 mov %r11, 8+64+24(%rsp)
1924 mov %r12, 8+64+32(%rsp)
1925 mov %r13, 8+64+40(%rsp)
1926 mov %r14, 8+64+48(%rsp)
1927 mov %r15, 8+64+56(%rsp)
1928
1929 ret
1930 .size __rsaz_512_mulx,.-__rsaz_512_mulx
1931 ___
1932 }
1933 {
1934 my ($out,$inp,$power)= $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
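# The gather/scatter table interleaves 16 values: for each of the 8 limbs,
# the 16 low 32-bit halves are stored consecutively (4 bytes per entry),
# followed by the 16 high halves, i.e. 128 bytes per limb, so a given
# entry is accessed with a 4-byte offset and a 128-byte stride.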
1935 $code.=<<___;
1936 .globl rsaz_512_scatter4
1937 .type rsaz_512_scatter4,\@abi-omnipotent
1938 .align 16
1939 rsaz_512_scatter4:
1940 leaq ($out,$power,4), $out
1941 movl \$8, %r9d
1942 jmp .Loop_scatter
1943 .align 16
1944 .Loop_scatter:
1945 movq ($inp), %rax
1946 leaq 8($inp), $inp
1947 movl %eax, ($out)
1948 shrq \$32, %rax
1949 movl %eax, 64($out)
1950 leaq 128($out), $out
1951 decl %r9d
1952 jnz .Loop_scatter
1953 ret
1954 .size rsaz_512_scatter4,.-rsaz_512_scatter4
1955
1956 .globl rsaz_512_gather4
1957 .type rsaz_512_gather4,\@abi-omnipotent
1958 .align 16
1959 rsaz_512_gather4:
1960 leaq ($inp,$power,4), $inp
1961 movl \$8, %r9d
1962 jmp .Loop_gather
1963 .align 16
1964 .Loop_gather:
1965 movl ($inp), %eax
1966 movl 64($inp), %r8d
1967 leaq 128($inp), $inp
1968 shlq \$32, %r8
1969 or %r8, %rax
1970 movq %rax, ($out)
1971 leaq 8($out), $out
1972 decl %r9d
1973 jnz .Loop_gather
1974 ret
1975 .size rsaz_512_gather4,.-rsaz_512_gather4
1976 ___
1977 }
1978
1979 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1980 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
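# One handler serves all five routines: if the fault address lies between a
# routine's *_body and *_epilogue labels (see HandlerData below), the
# non-volatile registers are restored from the fixed frame of 128+24 bytes
# plus six pushed registers before the unwind continues.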
1981 if ($win64) {
1982 $rec="%rcx";
1983 $frame="%rdx";
1984 $context="%r8";
1985 $disp="%r9";
1986
1987 $code.=<<___;
1988 .extern __imp_RtlVirtualUnwind
1989 .type se_handler,\@abi-omnipotent
1990 .align 16
1991 se_handler:
1992 push %rsi
1993 push %rdi
1994 push %rbx
1995 push %rbp
1996 push %r12
1997 push %r13
1998 push %r14
1999 push %r15
2000 pushfq
2001 sub \$64,%rsp
2002
2003 mov 120($context),%rax # pull context->Rax
2004 mov 248($context),%rbx # pull context->Rip
2005
2006 mov 8($disp),%rsi # disp->ImageBase
2007 mov 56($disp),%r11 # disp->HandlerData
2008
2009 mov 0(%r11),%r10d # HandlerData[0]
2010 lea (%rsi,%r10),%r10 # end of prologue label
2011 cmp %r10,%rbx # context->Rip<end of prologue label
2012 jb .Lcommon_seh_tail
2013
2014 mov 152($context),%rax # pull context->Rsp
2015
2016 mov 4(%r11),%r10d # HandlerData[1]
2017 lea (%rsi,%r10),%r10 # epilogue label
2018 cmp %r10,%rbx # context->Rip>=epilogue label
2019 jae .Lcommon_seh_tail
2020
2021 lea 128+24+48(%rax),%rax
2022
2023 mov -8(%rax),%rbx
2024 mov -16(%rax),%rbp
2025 mov -24(%rax),%r12
2026 mov -32(%rax),%r13
2027 mov -40(%rax),%r14
2028 mov -48(%rax),%r15
2029 mov %rbx,144($context) # restore context->Rbx
2030 mov %rbp,160($context) # restore context->Rbp
2031 mov %r12,216($context) # restore context->R12
2032 mov %r13,224($context) # restore context->R13
2033 mov %r14,232($context) # restore context->R14
2034 mov %r15,240($context) # restore context->R15
2035
2036 .Lcommon_seh_tail:
2037 mov 8(%rax),%rdi
2038 mov 16(%rax),%rsi
2039 mov %rax,152($context) # restore context->Rsp
2040 mov %rsi,168($context) # restore context->Rsi
2041 mov %rdi,176($context) # restore context->Rdi
2042
2043 mov 40($disp),%rdi # disp->ContextRecord
2044 mov $context,%rsi # context
2045 mov \$154,%ecx # sizeof(CONTEXT)
2046 .long 0xa548f3fc # cld; rep movsq
2047
2048 mov $disp,%rsi
2049 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
2050 mov 8(%rsi),%rdx # arg2, disp->ImageBase
2051 mov 0(%rsi),%r8 # arg3, disp->ControlPc
2052 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
2053 mov 40(%rsi),%r10 # disp->ContextRecord
2054 lea 56(%rsi),%r11 # &disp->HandlerData
2055 lea 24(%rsi),%r12 # &disp->EstablisherFrame
2056 mov %r10,32(%rsp) # arg5
2057 mov %r11,40(%rsp) # arg6
2058 mov %r12,48(%rsp) # arg7
2059 mov %rcx,56(%rsp) # arg8, (NULL)
2060 call *__imp_RtlVirtualUnwind(%rip)
2061
2062 mov \$1,%eax # ExceptionContinueSearch
2063 add \$64,%rsp
2064 popfq
2065 pop %r15
2066 pop %r14
2067 pop %r13
2068 pop %r12
2069 pop %rbp
2070 pop %rbx
2071 pop %rdi
2072 pop %rsi
2073 ret
.size	se_handler,.-se_handler
2075
2076 .section .pdata
2077 .align 4
2078 .rva .LSEH_begin_rsaz_512_sqr
2079 .rva .LSEH_end_rsaz_512_sqr
2080 .rva .LSEH_info_rsaz_512_sqr
2081
2082 .rva .LSEH_begin_rsaz_512_mul
2083 .rva .LSEH_end_rsaz_512_mul
2084 .rva .LSEH_info_rsaz_512_mul
2085
2086 .rva .LSEH_begin_rsaz_512_mul_gather4
2087 .rva .LSEH_end_rsaz_512_mul_gather4
2088 .rva .LSEH_info_rsaz_512_mul_gather4
2089
2090 .rva .LSEH_begin_rsaz_512_mul_scatter4
2091 .rva .LSEH_end_rsaz_512_mul_scatter4
2092 .rva .LSEH_info_rsaz_512_mul_scatter4
2093
2094 .rva .LSEH_begin_rsaz_512_mul_by_one
2095 .rva .LSEH_end_rsaz_512_mul_by_one
2096 .rva .LSEH_info_rsaz_512_mul_by_one
2097
2098 .section .xdata
2099 .align 8
2100 .LSEH_info_rsaz_512_sqr:
2101 .byte 9,0,0,0
2102 .rva se_handler
2103 .rva .Lsqr_body,.Lsqr_epilogue # HandlerData[]
2104 .LSEH_info_rsaz_512_mul:
2105 .byte 9,0,0,0
2106 .rva se_handler
2107 .rva .Lmul_body,.Lmul_epilogue # HandlerData[]
2108 .LSEH_info_rsaz_512_mul_gather4:
2109 .byte 9,0,0,0
2110 .rva se_handler
2111 .rva .Lmul_gather4_body,.Lmul_gather4_epilogue # HandlerData[]
2112 .LSEH_info_rsaz_512_mul_scatter4:
2113 .byte 9,0,0,0
2114 .rva se_handler
2115 .rva .Lmul_scatter4_body,.Lmul_scatter4_epilogue # HandlerData[]
2116 .LSEH_info_rsaz_512_mul_by_one:
2117 .byte 9,0,0,0
2118 .rva se_handler
2119 .rva .Lmul_by_one_body,.Lmul_by_one_epilogue # HandlerData[]
2120 ___
2121 }
2122
2123 $code =~ s/\`([^\`]*)\`/eval $1/gem;
2124 print $code;
2125 close STDOUT;