1 #!/usr/bin/env perl
2
3 ##############################################################################
4 # #
5 # Copyright (c) 2012, Intel Corporation #
6 # #
7 # All rights reserved. #
8 # #
9 # Redistribution and use in source and binary forms, with or without #
10 # modification, are permitted provided that the following conditions are #
11 # met: #
12 # #
13 # * Redistributions of source code must retain the above copyright #
14 # notice, this list of conditions and the following disclaimer. #
15 # #
16 # * Redistributions in binary form must reproduce the above copyright #
17 # notice, this list of conditions and the following disclaimer in the #
18 # documentation and/or other materials provided with the #
19 # distribution. #
20 # #
21 # * Neither the name of the Intel Corporation nor the names of its #
22 # contributors may be used to endorse or promote products derived from #
23 # this software without specific prior written permission. #
24 # #
25 # #
26 # THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY #
27 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE #
28 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR #
29 # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR #
30 # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, #
31 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, #
32 # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR #
33 # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF #
34 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING #
35 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS #
36 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #
37 # #
38 ##############################################################################
39 # Developers and authors: #
40 # Shay Gueron (1, 2), and Vlad Krasnov (1) #
41 # (1) Intel Architecture Group, Microprocessor and Chipset Development, #
42 # Israel Development Center, Haifa, Israel #
43 # (2) University of Haifa #
44 ##############################################################################
45 # Reference: #
46 # [1] S. Gueron, "Efficient Software Implementations of Modular #
47 # Exponentiation", http://eprint.iacr.org/2011/239 #
48 # [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring". #
49 # IEEE Proceedings of 9th International Conference on Information #
50 # Technology: New Generations (ITNG 2012), 821-823 (2012). #
51 # [3] S. Gueron, Efficient Software Implementations of Modular Exponentiation#
52 # Journal of Cryptographic Engineering 2:31-43 (2012). #
53 # [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis #
54 # resistant 512-bit and 1024-bit modular exponentiation for optimizing #
55 # RSA1024 and RSA2048 on x86_64 platforms", #
56 # http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest#
57 ##############################################################################
58
59 # While the original submission covers both 512- and 1024-bit
60 # exponentiation, this module is limited to the 512-bit version only
61 # (and as such accelerates RSA1024 sign). This is because the
62 # improvement for longer keys is not high enough to justify the effort;
63 # the highest measured gain was ~5% on Westmere. [This is relative to
64 # OpenSSL 1.0.2, upcoming at the time of this writing!] Nor does this
65 # module implement a "monolithic" complete-exponentiation
66 # jumbo-subroutine; instead it adheres to a more modular mixture of C
67 # and assembly. And it is optimized even for processors other than the
68 # Intel Core family (see the table below for improvement coefficients).
69 # <appro@openssl.org>
70 #
71 # RSA1024 sign/sec this/original |this/rsax(*) this/fips(*)
72 # ----------------+---------------------------
73 # Opteron +13% |+5% +20%
74 # Bulldozer -0% |-1% +10%
75 # P4 +11% |+7% +8%
76 # Westmere +5% |+14% +17%
77 # Sandy Bridge +2% |+12% +29%
78 # Ivy Bridge +1% |+11% +35%
79 # Haswell(**) -0% |+12% +39%
80 # Atom +13% |+11% +4%
81 # VIA Nano +70% |+9% +25%
82 #
83 # (*) rsax engine and fips numbers are presented for reference
84 # purposes;
85 # (**) MULX was attempted, but found to give only marginal improvement;
86
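# For reference, the entry points below are called from C glue code with
# prototypes along the following lines, reconstructed from the register
# assignments in this file (a sketch only; the BN_ULONG* types are an
# assumption and the authoritative declarations live in the companion C
# module, e.g. rsaz_exp.c):
#
#	void rsaz_512_sqr(BN_ULONG *out, const BN_ULONG *inp,
#			const BN_ULONG *mod, BN_ULONG n0, int times);
#	void rsaz_512_mul(BN_ULONG *out, const BN_ULONG *ap,
#			const BN_ULONG *bp, const BN_ULONG *mod, BN_ULONG n0);
#	void rsaz_512_mul_gather4(BN_ULONG *out, const BN_ULONG *ap,
#			const BN_ULONG *table, const BN_ULONG *mod,
#			BN_ULONG n0, int power);
#	void rsaz_512_mul_scatter4(BN_ULONG *out, const BN_ULONG *ap,
#			const BN_ULONG *mod, BN_ULONG n0,
#			BN_ULONG *table, int power);
#	void rsaz_512_mul_by_one(BN_ULONG *out, const BN_ULONG *inp,
#			const BN_ULONG *mod, BN_ULONG n0);
#	void rsaz_512_scatter4(BN_ULONG *table, const BN_ULONG *val, int power);
#	void rsaz_512_gather4(BN_ULONG *val, const BN_ULONG *table, int power);
#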
87 $flavour = shift;
88 $output = shift;
89 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
90
91 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
92
93 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
94 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
95 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
96 die "can't locate x86_64-xlate.pl";
97
98 open OUT,"| $^X $xlate $flavour $output";
99 *STDOUT=*OUT;
100
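# Probe the assembler for MULX/ADCX/ADOX support: GNU assembler 2.23+,
# nasm 2.10+ or MASM ml64 version 11+ can encode them; $addx then gates
# the corresponding code paths below.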
101 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
102 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
103 $addx = ($1>=2.23);
104 }
105
106 if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
107 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
108 $addx = ($1>=2.10);
109 }
110
111 if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
112 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
113 $addx = ($1>=11);
114 }
115
116 ($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp"); # common internal API
117 {
118 my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d");
119
120 $code.=<<___;
121 .text
122
123 .extern OPENSSL_ia32cap_P
124
125 .globl rsaz_512_sqr
126 .type rsaz_512_sqr,\@function,5
127 .align 32
128 rsaz_512_sqr: # 25-29% faster than rsaz_512_mul
129 push %rbx
130 push %rbp
131 push %r12
132 push %r13
133 push %r14
134 push %r15
135
136 subq \$128+24, %rsp
137 .Lsqr_body:
138 movq $mod, %rbp # common argument
139 movq ($inp), %rdx
140 movq 8($inp), %rax
141 movq $n0, 128(%rsp)
142 ___
143 $code.=<<___ if ($addx);
144 movl \$0x80100,%r11d
145 andl OPENSSL_ia32cap_P+8(%rip),%r11d
146 cmpl \$0x80100,%r11d # check for MULX and ADCX/ADOX
147 je .Loop_sqrx
148 ___
149 $code.=<<___;
150 jmp .Loop_sqr
151
152 .align 32
153 .Loop_sqr:
154 movl $times,128+8(%rsp)
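#
# In outline: each iteration accumulates the off-diagonal products
# a[i]*a[j], j>i, once, doubles the pair of output words being finalized
# (the add/adc and lea/shld sequences), then adds the diagonal square
# a[i]*a[i] and stores the two finished words to the stack.
#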
155 #first iteration
156 movq %rdx, %rbx
157 mulq %rdx
158 movq %rax, %r8
159 movq 16($inp), %rax
160 movq %rdx, %r9
161
162 mulq %rbx
163 addq %rax, %r9
164 movq 24($inp), %rax
165 movq %rdx, %r10
166 adcq \$0, %r10
167
168 mulq %rbx
169 addq %rax, %r10
170 movq 32($inp), %rax
171 movq %rdx, %r11
172 adcq \$0, %r11
173
174 mulq %rbx
175 addq %rax, %r11
176 movq 40($inp), %rax
177 movq %rdx, %r12
178 adcq \$0, %r12
179
180 mulq %rbx
181 addq %rax, %r12
182 movq 48($inp), %rax
183 movq %rdx, %r13
184 adcq \$0, %r13
185
186 mulq %rbx
187 addq %rax, %r13
188 movq 56($inp), %rax
189 movq %rdx, %r14
190 adcq \$0, %r14
191
192 mulq %rbx
193 addq %rax, %r14
194 movq %rbx, %rax
195 movq %rdx, %r15
196 adcq \$0, %r15
197
198 addq %r8, %r8 #shlq \$1, %r8
199 movq %r9, %rcx
200 adcq %r9, %r9 #shld \$1, %r8, %r9
201
202 mulq %rax
203 movq %rax, (%rsp)
204 addq %rdx, %r8
205 adcq \$0, %r9
206
207 movq %r8, 8(%rsp)
208 shrq \$63, %rcx
209
210 #second iteration
211 movq 8($inp), %r8
212 movq 16($inp), %rax
213 mulq %r8
214 addq %rax, %r10
215 movq 24($inp), %rax
216 movq %rdx, %rbx
217 adcq \$0, %rbx
218
219 mulq %r8
220 addq %rax, %r11
221 movq 32($inp), %rax
222 adcq \$0, %rdx
223 addq %rbx, %r11
224 movq %rdx, %rbx
225 adcq \$0, %rbx
226
227 mulq %r8
228 addq %rax, %r12
229 movq 40($inp), %rax
230 adcq \$0, %rdx
231 addq %rbx, %r12
232 movq %rdx, %rbx
233 adcq \$0, %rbx
234
235 mulq %r8
236 addq %rax, %r13
237 movq 48($inp), %rax
238 adcq \$0, %rdx
239 addq %rbx, %r13
240 movq %rdx, %rbx
241 adcq \$0, %rbx
242
243 mulq %r8
244 addq %rax, %r14
245 movq 56($inp), %rax
246 adcq \$0, %rdx
247 addq %rbx, %r14
248 movq %rdx, %rbx
249 adcq \$0, %rbx
250
251 mulq %r8
252 addq %rax, %r15
253 movq %r8, %rax
254 adcq \$0, %rdx
255 addq %rbx, %r15
256 movq %rdx, %r8
257 movq %r10, %rdx
258 adcq \$0, %r8
259
260 add %rdx, %rdx
261 lea (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
262 movq %r11, %rbx
263 adcq %r11, %r11 #shld \$1, %r10, %r11
264
265 mulq %rax
266 addq %rax, %r9
267 adcq %rdx, %r10
268 adcq \$0, %r11
269
270 movq %r9, 16(%rsp)
271 movq %r10, 24(%rsp)
272 shrq \$63, %rbx
273
274 #third iteration
275 movq 16($inp), %r9
276 movq 24($inp), %rax
277 mulq %r9
278 addq %rax, %r12
279 movq 32($inp), %rax
280 movq %rdx, %rcx
281 adcq \$0, %rcx
282
283 mulq %r9
284 addq %rax, %r13
285 movq 40($inp), %rax
286 adcq \$0, %rdx
287 addq %rcx, %r13
288 movq %rdx, %rcx
289 adcq \$0, %rcx
290
291 mulq %r9
292 addq %rax, %r14
293 movq 48($inp), %rax
294 adcq \$0, %rdx
295 addq %rcx, %r14
296 movq %rdx, %rcx
297 adcq \$0, %rcx
298
299 mulq %r9
300 movq %r12, %r10
301 lea (%rbx,%r12,2), %r12 #shld \$1, %rbx, %r12
302 addq %rax, %r15
303 movq 56($inp), %rax
304 adcq \$0, %rdx
305 addq %rcx, %r15
306 movq %rdx, %rcx
307 adcq \$0, %rcx
308
309 mulq %r9
310 shrq \$63, %r10
311 addq %rax, %r8
312 movq %r9, %rax
313 adcq \$0, %rdx
314 addq %rcx, %r8
315 movq %rdx, %r9
316 adcq \$0, %r9
317
318 movq %r13, %rcx
319 leaq (%r10,%r13,2), %r13 #shld \$1, %r12, %r13
320
321 mulq %rax
322 addq %rax, %r11
323 adcq %rdx, %r12
324 adcq \$0, %r13
325
326 movq %r11, 32(%rsp)
327 movq %r12, 40(%rsp)
328 shrq \$63, %rcx
329
330 #fourth iteration
331 movq 24($inp), %r10
332 movq 32($inp), %rax
333 mulq %r10
334 addq %rax, %r14
335 movq 40($inp), %rax
336 movq %rdx, %rbx
337 adcq \$0, %rbx
338
339 mulq %r10
340 addq %rax, %r15
341 movq 48($inp), %rax
342 adcq \$0, %rdx
343 addq %rbx, %r15
344 movq %rdx, %rbx
345 adcq \$0, %rbx
346
347 mulq %r10
348 movq %r14, %r12
349 leaq (%rcx,%r14,2), %r14 #shld \$1, %rcx, %r14
350 addq %rax, %r8
351 movq 56($inp), %rax
352 adcq \$0, %rdx
353 addq %rbx, %r8
354 movq %rdx, %rbx
355 adcq \$0, %rbx
356
357 mulq %r10
358 shrq \$63, %r12
359 addq %rax, %r9
360 movq %r10, %rax
361 adcq \$0, %rdx
362 addq %rbx, %r9
363 movq %rdx, %r10
364 adcq \$0, %r10
365
366 movq %r15, %rbx
367 leaq (%r12,%r15,2),%r15 #shld \$1, %r14, %r15
368
369 mulq %rax
370 addq %rax, %r13
371 adcq %rdx, %r14
372 adcq \$0, %r15
373
374 movq %r13, 48(%rsp)
375 movq %r14, 56(%rsp)
376 shrq \$63, %rbx
377
378 #fifth iteration
379 movq 32($inp), %r11
380 movq 40($inp), %rax
381 mulq %r11
382 addq %rax, %r8
383 movq 48($inp), %rax
384 movq %rdx, %rcx
385 adcq \$0, %rcx
386
387 mulq %r11
388 addq %rax, %r9
389 movq 56($inp), %rax
390 adcq \$0, %rdx
391 movq %r8, %r12
392 leaq (%rbx,%r8,2), %r8 #shld \$1, %rbx, %r8
393 addq %rcx, %r9
394 movq %rdx, %rcx
395 adcq \$0, %rcx
396
397 mulq %r11
398 shrq \$63, %r12
399 addq %rax, %r10
400 movq %r11, %rax
401 adcq \$0, %rdx
402 addq %rcx, %r10
403 movq %rdx, %r11
404 adcq \$0, %r11
405
406 movq %r9, %rcx
407 leaq (%r12,%r9,2), %r9 #shld \$1, %r8, %r9
408
409 mulq %rax
410 addq %rax, %r15
411 adcq %rdx, %r8
412 adcq \$0, %r9
413
414 movq %r15, 64(%rsp)
415 movq %r8, 72(%rsp)
416 shrq \$63, %rcx
417
418 #sixth iteration
419 movq 40($inp), %r12
420 movq 48($inp), %rax
421 mulq %r12
422 addq %rax, %r10
423 movq 56($inp), %rax
424 movq %rdx, %rbx
425 adcq \$0, %rbx
426
427 mulq %r12
428 addq %rax, %r11
429 movq %r12, %rax
430 movq %r10, %r15
431 leaq (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
432 adcq \$0, %rdx
433 shrq \$63, %r15
434 addq %rbx, %r11
435 movq %rdx, %r12
436 adcq \$0, %r12
437
438 movq %r11, %rbx
439 leaq (%r15,%r11,2), %r11 #shld \$1, %r10, %r11
440
441 mulq %rax
442 addq %rax, %r9
443 adcq %rdx, %r10
444 adcq \$0, %r11
445
446 movq %r9, 80(%rsp)
447 movq %r10, 88(%rsp)
448
449 #seventh iteration
450 movq 48($inp), %r13
451 movq 56($inp), %rax
452 mulq %r13
453 addq %rax, %r12
454 movq %r13, %rax
455 movq %rdx, %r13
456 adcq \$0, %r13
457
458 xorq %r14, %r14
459 shlq \$1, %rbx
460 adcq %r12, %r12 #shld \$1, %rbx, %r12
461 adcq %r13, %r13 #shld \$1, %r12, %r13
462 adcq %r14, %r14 #shld \$1, %r13, %r14
463
464 mulq %rax
465 addq %rax, %r11
466 adcq %rdx, %r12
467 adcq \$0, %r13
468
469 movq %r11, 96(%rsp)
470 movq %r12, 104(%rsp)
471
472 #eighth iteration
473 movq 56($inp), %rax
474 mulq %rax
475 addq %rax, %r13
476 adcq \$0, %rdx
477
478 addq %rdx, %r14
479
480 movq %r13, 112(%rsp)
481 movq %r14, 120(%rsp)
482
483 movq (%rsp), %r8
484 movq 8(%rsp), %r9
485 movq 16(%rsp), %r10
486 movq 24(%rsp), %r11
487 movq 32(%rsp), %r12
488 movq 40(%rsp), %r13
489 movq 48(%rsp), %r14
490 movq 56(%rsp), %r15
491
492 call __rsaz_512_reduce
493
494 addq 64(%rsp), %r8
495 adcq 72(%rsp), %r9
496 adcq 80(%rsp), %r10
497 adcq 88(%rsp), %r11
498 adcq 96(%rsp), %r12
499 adcq 104(%rsp), %r13
500 adcq 112(%rsp), %r14
501 adcq 120(%rsp), %r15
502 sbbq %rcx, %rcx
503
504 call __rsaz_512_subtract
505
506 movq %r8, %rdx
507 movq %r9, %rax
508 movl 128+8(%rsp), $times
509 movq $out, $inp
510
511 decl $times
512 jnz .Loop_sqr
513 ___
514 if ($addx) {
515 $code.=<<___;
516 jmp .Lsqr_tail
517
518 .align 32
519 .Loop_sqrx:
520 movl $times,128+8(%rsp)
521 movq $out, %xmm0 # off-load
522 movq %rbp, %xmm1 # off-load
523 #first iteration
524 mulx %rax, %r8, %r9
525
526 mulx 16($inp), %rcx, %r10
527 xor %rbp, %rbp # cf=0, of=0
528
529 mulx 24($inp), %rax, %r11
530 adcx %rcx, %r9
531
532 mulx 32($inp), %rcx, %r12
533 adcx %rax, %r10
534
535 mulx 40($inp), %rax, %r13
536 adcx %rcx, %r11
537
538 .byte 0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($inp), %rcx, %r14
539 adcx %rax, %r12
540 adcx %rcx, %r13
541
542 .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r15
543 adcx %rax, %r14
544 adcx %rbp, %r15 # %rbp is 0
545
546 mov %r9, %rcx
547 shld \$1, %r8, %r9
548 shl \$1, %r8
549
550 xor %ebp, %ebp
551 mulx %rdx, %rax, %rdx
552 adcx %rdx, %r8
553 mov 8($inp), %rdx
554 adcx %rbp, %r9
555
556 mov %rax, (%rsp)
557 mov %r8, 8(%rsp)
558
559 #second iteration
560 mulx 16($inp), %rax, %rbx
561 adox %rax, %r10
562 adcx %rbx, %r11
563
564 .byte 0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r8
565 adox $out, %r11
566 adcx %r8, %r12
567
568 mulx 32($inp), %rax, %rbx
569 adox %rax, %r12
570 adcx %rbx, %r13
571
572 mulx 40($inp), $out, %r8
573 adox $out, %r13
574 adcx %r8, %r14
575
576 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
577 adox %rax, %r14
578 adcx %rbx, %r15
579
580 .byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r8
581 adox $out, %r15
582 adcx %rbp, %r8
583 adox %rbp, %r8
584
585 mov %r11, %rbx
586 shld \$1, %r10, %r11
587 shld \$1, %rcx, %r10
588
589 xor %ebp,%ebp
590 mulx %rdx, %rax, %rcx
591 mov 16($inp), %rdx
592 adcx %rax, %r9
593 adcx %rcx, %r10
594 adcx %rbp, %r11
595
596 mov %r9, 16(%rsp)
597 .byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 # mov %r10, 24(%rsp)
598
599 #third iteration
600 .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r9
601 adox $out, %r12
602 adcx %r9, %r13
603
604 mulx 32($inp), %rax, %rcx
605 adox %rax, %r13
606 adcx %rcx, %r14
607
608 mulx 40($inp), $out, %r9
609 adox $out, %r14
610 adcx %r9, %r15
611
612 .byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rcx
613 adox %rax, %r15
614 adcx %rcx, %r8
615
616 .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r9
617 adox $out, %r8
618 adcx %rbp, %r9
619 adox %rbp, %r9
620
621 mov %r13, %rcx
622 shld \$1, %r12, %r13
623 shld \$1, %rbx, %r12
624
625 xor %ebp, %ebp
626 mulx %rdx, %rax, %rdx
627 adcx %rax, %r11
628 adcx %rdx, %r12
629 mov 24($inp), %rdx
630 adcx %rbp, %r13
631
632 mov %r11, 32(%rsp)
633 .byte 0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00 # mov %r12, 40(%rsp)
634
635 #fourth iteration
636 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00 # mulx 32($inp), %rax, %rbx
637 adox %rax, %r14
638 adcx %rbx, %r15
639
640 mulx 40($inp), $out, %r10
641 adox $out, %r15
642 adcx %r10, %r8
643
644 mulx 48($inp), %rax, %rbx
645 adox %rax, %r8
646 adcx %rbx, %r9
647
648 mulx 56($inp), $out, %r10
649 adox $out, %r9
650 adcx %rbp, %r10
651 adox %rbp, %r10
652
653 .byte 0x66
654 mov %r15, %rbx
655 shld \$1, %r14, %r15
656 shld \$1, %rcx, %r14
657
658 xor %ebp, %ebp
659 mulx %rdx, %rax, %rdx
660 adcx %rax, %r13
661 adcx %rdx, %r14
662 mov 32($inp), %rdx
663 adcx %rbp, %r15
664
665 mov %r13, 48(%rsp)
666 mov %r14, 56(%rsp)
667
668 #fifth iteration
669 .byte 0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00 # mulx 40($inp), $out, %r11
670 adox $out, %r8
671 adcx %r11, %r9
672
673 mulx 48($inp), %rax, %rcx
674 adox %rax, %r9
675 adcx %rcx, %r10
676
677 mulx 56($inp), $out, %r11
678 adox $out, %r10
679 adcx %rbp, %r11
680 adox %rbp, %r11
681
682 mov %r9, %rcx
683 shld \$1, %r8, %r9
684 shld \$1, %rbx, %r8
685
686 xor %ebp, %ebp
687 mulx %rdx, %rax, %rdx
688 adcx %rax, %r15
689 adcx %rdx, %r8
690 mov 40($inp), %rdx
691 adcx %rbp, %r9
692
693 mov %r15, 64(%rsp)
694 mov %r8, 72(%rsp)
695
696 #sixth iteration
697 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
698 adox %rax, %r10
699 adcx %rbx, %r11
700
701 .byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r12
702 adox $out, %r11
703 adcx %rbp, %r12
704 adox %rbp, %r12
705
706 mov %r11, %rbx
707 shld \$1, %r10, %r11
708 shld \$1, %rcx, %r10
709
710 xor %ebp, %ebp
711 mulx %rdx, %rax, %rdx
712 adcx %rax, %r9
713 adcx %rdx, %r10
714 mov 48($inp), %rdx
715 adcx %rbp, %r11
716
717 mov %r9, 80(%rsp)
718 mov %r10, 88(%rsp)
719
720 #seventh iteration
721 .byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r13
722 adox %rax, %r12
723 adox %rbp, %r13
724
725 xor %r14, %r14
726 shld \$1, %r13, %r14
727 shld \$1, %r12, %r13
728 shld \$1, %rbx, %r12
729
730 xor %ebp, %ebp
731 mulx %rdx, %rax, %rdx
732 adcx %rax, %r11
733 adcx %rdx, %r12
734 mov 56($inp), %rdx
735 adcx %rbp, %r13
736
737 .byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 # mov %r11, 96(%rsp)
738 .byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 # mov %r12, 104(%rsp)
739
740 #eighth iteration
741 mulx %rdx, %rax, %rdx
742 adox %rax, %r13
743 adox %rbp, %rdx
744
745 .byte 0x66
746 add %rdx, %r14
747
748 movq %r13, 112(%rsp)
749 movq %r14, 120(%rsp)
750 movq %xmm0, $out
751 movq %xmm1, %rbp
752
753 movq 128(%rsp), %rdx # pull $n0
754 movq (%rsp), %r8
755 movq 8(%rsp), %r9
756 movq 16(%rsp), %r10
757 movq 24(%rsp), %r11
758 movq 32(%rsp), %r12
759 movq 40(%rsp), %r13
760 movq 48(%rsp), %r14
761 movq 56(%rsp), %r15
762
763 call __rsaz_512_reducex
764
765 addq 64(%rsp), %r8
766 adcq 72(%rsp), %r9
767 adcq 80(%rsp), %r10
768 adcq 88(%rsp), %r11
769 adcq 96(%rsp), %r12
770 adcq 104(%rsp), %r13
771 adcq 112(%rsp), %r14
772 adcq 120(%rsp), %r15
773 sbbq %rcx, %rcx
774
775 call __rsaz_512_subtract
776
777 movq %r8, %rdx
778 movq %r9, %rax
779 movl 128+8(%rsp), $times
780 movq $out, $inp
781
782 decl $times
783 jnz .Loop_sqrx
784
785 .Lsqr_tail:
786 ___
787 }
788 $code.=<<___;
789
790 leaq 128+24+48(%rsp), %rax
791 movq -48(%rax), %r15
792 movq -40(%rax), %r14
793 movq -32(%rax), %r13
794 movq -24(%rax), %r12
795 movq -16(%rax), %rbp
796 movq -8(%rax), %rbx
797 leaq (%rax), %rsp
798 .Lsqr_epilogue:
799 ret
800 .size rsaz_512_sqr,.-rsaz_512_sqr
801 ___
802 }
803 {
804 my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
805 $code.=<<___;
806 .globl rsaz_512_mul
807 .type rsaz_512_mul,\@function,5
808 .align 32
809 rsaz_512_mul:
810 push %rbx
811 push %rbp
812 push %r12
813 push %r13
814 push %r14
815 push %r15
816
817 subq \$128+24, %rsp
818 .Lmul_body:
819 movq $out, %xmm0 # off-load arguments
820 movq $mod, %xmm1
821 movq $n0, 128(%rsp)
822 ___
823 $code.=<<___ if ($addx);
824 movl \$0x80100,%r11d
825 andl OPENSSL_ia32cap_P+8(%rip),%r11d
826 cmpl \$0x80100,%r11d # check for MULX and ADCX/ADOX
827 je .Lmulx
828 ___
829 $code.=<<___;
830 movq ($bp), %rbx # pass b[0]
831 movq $bp, %rbp # pass argument
832 call __rsaz_512_mul
833
834 movq %xmm0, $out
835 movq %xmm1, %rbp
836
837 movq (%rsp), %r8
838 movq 8(%rsp), %r9
839 movq 16(%rsp), %r10
840 movq 24(%rsp), %r11
841 movq 32(%rsp), %r12
842 movq 40(%rsp), %r13
843 movq 48(%rsp), %r14
844 movq 56(%rsp), %r15
845
846 call __rsaz_512_reduce
847 ___
848 $code.=<<___ if ($addx);
849 jmp .Lmul_tail
850
851 .align 32
852 .Lmulx:
853 movq $bp, %rbp # pass argument
854 movq ($bp), %rdx # pass b[0]
855 call __rsaz_512_mulx
856
857 movq %xmm0, $out
858 movq %xmm1, %rbp
859
860 movq 128(%rsp), %rdx # pull $n0
861 movq (%rsp), %r8
862 movq 8(%rsp), %r9
863 movq 16(%rsp), %r10
864 movq 24(%rsp), %r11
865 movq 32(%rsp), %r12
866 movq 40(%rsp), %r13
867 movq 48(%rsp), %r14
868 movq 56(%rsp), %r15
869
870 call __rsaz_512_reducex
871 .Lmul_tail:
872 ___
873 $code.=<<___;
874 addq 64(%rsp), %r8
875 adcq 72(%rsp), %r9
876 adcq 80(%rsp), %r10
877 adcq 88(%rsp), %r11
878 adcq 96(%rsp), %r12
879 adcq 104(%rsp), %r13
880 adcq 112(%rsp), %r14
881 adcq 120(%rsp), %r15
882 sbbq %rcx, %rcx
883
884 call __rsaz_512_subtract
885
886 leaq 128+24+48(%rsp), %rax
887 movq -48(%rax), %r15
888 movq -40(%rax), %r14
889 movq -32(%rax), %r13
890 movq -24(%rax), %r12
891 movq -16(%rax), %rbp
892 movq -8(%rax), %rbx
893 leaq (%rax), %rsp
894 .Lmul_epilogue:
895 ret
896 .size rsaz_512_mul,.-rsaz_512_mul
897 ___
898 }
899 {
900 my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
901 $code.=<<___;
902 .globl rsaz_512_mul_gather4
903 .type rsaz_512_mul_gather4,\@function,6
904 .align 32
905 rsaz_512_mul_gather4:
906 push %rbx
907 push %rbp
908 push %r12
909 push %r13
910 push %r14
911 push %r15
912
913 mov $pwr, $pwr # zero-extend $pwr
914 subq \$128+24, %rsp
915 .Lmul_gather4_body:
916 ___
917 $code.=<<___ if ($addx);
918 movl \$0x80100,%r11d
919 andl OPENSSL_ia32cap_P+8(%rip),%r11d
920 cmpl \$0x80100,%r11d # check for MULX and ADCX/ADOX
921 je .Lmulx_gather
922 ___
923 $code.=<<___;
924 movl 64($bp,$pwr,4), %eax
925 movq $out, %xmm0 # off-load arguments
926 movl ($bp,$pwr,4), %ebx
927 movq $mod, %xmm1
928 movq $n0, 128(%rsp)
929
930 shlq \$32, %rax
931 or %rax, %rbx
932 movq ($ap), %rax
933 movq 8($ap), %rcx
934 leaq 128($bp,$pwr,4), %rbp
935 mulq %rbx # 0 iteration
936 movq %rax, (%rsp)
937 movq %rcx, %rax
938 movq %rdx, %r8
939
940 mulq %rbx
941 movd (%rbp), %xmm4
942 addq %rax, %r8
943 movq 16($ap), %rax
944 movq %rdx, %r9
945 adcq \$0, %r9
946
947 mulq %rbx
948 movd 64(%rbp), %xmm5
949 addq %rax, %r9
950 movq 24($ap), %rax
951 movq %rdx, %r10
952 adcq \$0, %r10
953
954 mulq %rbx
955 pslldq \$4, %xmm5
956 addq %rax, %r10
957 movq 32($ap), %rax
958 movq %rdx, %r11
959 adcq \$0, %r11
960
961 mulq %rbx
962 por %xmm5, %xmm4
963 addq %rax, %r11
964 movq 40($ap), %rax
965 movq %rdx, %r12
966 adcq \$0, %r12
967
968 mulq %rbx
969 addq %rax, %r12
970 movq 48($ap), %rax
971 movq %rdx, %r13
972 adcq \$0, %r13
973
974 mulq %rbx
975 leaq 128(%rbp), %rbp
976 addq %rax, %r13
977 movq 56($ap), %rax
978 movq %rdx, %r14
979 adcq \$0, %r14
980
981 mulq %rbx
982 movq %xmm4, %rbx
983 addq %rax, %r14
984 movq ($ap), %rax
985 movq %rdx, %r15
986 adcq \$0, %r15
987
988 leaq 8(%rsp), %rdi
989 movl \$7, %ecx
990 jmp .Loop_mul_gather
991
992 .align 32
993 .Loop_mul_gather:
994 mulq %rbx
995 addq %rax, %r8
996 movq 8($ap), %rax
997 movq %r8, (%rdi)
998 movq %rdx, %r8
999 adcq \$0, %r8
1000
1001 mulq %rbx
1002 movd (%rbp), %xmm4
1003 addq %rax, %r9
1004 movq 16($ap), %rax
1005 adcq \$0, %rdx
1006 addq %r9, %r8
1007 movq %rdx, %r9
1008 adcq \$0, %r9
1009
1010 mulq %rbx
1011 movd 64(%rbp), %xmm5
1012 addq %rax, %r10
1013 movq 24($ap), %rax
1014 adcq \$0, %rdx
1015 addq %r10, %r9
1016 movq %rdx, %r10
1017 adcq \$0, %r10
1018
1019 mulq %rbx
1020 pslldq \$4, %xmm5
1021 addq %rax, %r11
1022 movq 32($ap), %rax
1023 adcq \$0, %rdx
1024 addq %r11, %r10
1025 movq %rdx, %r11
1026 adcq \$0, %r11
1027
1028 mulq %rbx
1029 por %xmm5, %xmm4
1030 addq %rax, %r12
1031 movq 40($ap), %rax
1032 adcq \$0, %rdx
1033 addq %r12, %r11
1034 movq %rdx, %r12
1035 adcq \$0, %r12
1036
1037 mulq %rbx
1038 addq %rax, %r13
1039 movq 48($ap), %rax
1040 adcq \$0, %rdx
1041 addq %r13, %r12
1042 movq %rdx, %r13
1043 adcq \$0, %r13
1044
1045 mulq %rbx
1046 addq %rax, %r14
1047 movq 56($ap), %rax
1048 adcq \$0, %rdx
1049 addq %r14, %r13
1050 movq %rdx, %r14
1051 adcq \$0, %r14
1052
1053 mulq %rbx
1054 movq %xmm4, %rbx
1055 addq %rax, %r15
1056 movq ($ap), %rax
1057 adcq \$0, %rdx
1058 addq %r15, %r14
1059 movq %rdx, %r15
1060 adcq \$0, %r15
1061
1062 leaq 128(%rbp), %rbp
1063 leaq 8(%rdi), %rdi
1064
1065 decl %ecx
1066 jnz .Loop_mul_gather
1067
1068 movq %r8, (%rdi)
1069 movq %r9, 8(%rdi)
1070 movq %r10, 16(%rdi)
1071 movq %r11, 24(%rdi)
1072 movq %r12, 32(%rdi)
1073 movq %r13, 40(%rdi)
1074 movq %r14, 48(%rdi)
1075 movq %r15, 56(%rdi)
1076
1077 movq %xmm0, $out
1078 movq %xmm1, %rbp
1079
1080 movq (%rsp), %r8
1081 movq 8(%rsp), %r9
1082 movq 16(%rsp), %r10
1083 movq 24(%rsp), %r11
1084 movq 32(%rsp), %r12
1085 movq 40(%rsp), %r13
1086 movq 48(%rsp), %r14
1087 movq 56(%rsp), %r15
1088
1089 call __rsaz_512_reduce
1090 ___
1091 $code.=<<___ if ($addx);
1092 jmp .Lmul_gather_tail
1093
1094 .align 32
1095 .Lmulx_gather:
1096 mov 64($bp,$pwr,4), %eax
1097 movq $out, %xmm0 # off-load arguments
1098 lea 128($bp,$pwr,4), %rbp
1099 mov ($bp,$pwr,4), %edx
1100 movq $mod, %xmm1
1101 mov $n0, 128(%rsp)
1102
1103 shl \$32, %rax
1104 or %rax, %rdx
1105 mulx ($ap), %rbx, %r8 # 0 iteration
1106 mov %rbx, (%rsp)
1107 xor %edi, %edi # cf=0, of=0
1108
1109 mulx 8($ap), %rax, %r9
1110 movd (%rbp), %xmm4
1111
1112 mulx 16($ap), %rbx, %r10
1113 movd 64(%rbp), %xmm5
1114 adcx %rax, %r8
1115
1116 mulx 24($ap), %rax, %r11
1117 pslldq \$4, %xmm5
1118 adcx %rbx, %r9
1119
1120 mulx 32($ap), %rbx, %r12
1121 por %xmm5, %xmm4
1122 adcx %rax, %r10
1123
1124 mulx 40($ap), %rax, %r13
1125 adcx %rbx, %r11
1126
1127 mulx 48($ap), %rbx, %r14
1128 lea 128(%rbp), %rbp
1129 adcx %rax, %r12
1130
1131 mulx 56($ap), %rax, %r15
1132 movq %xmm4, %rdx
1133 adcx %rbx, %r13
1134 adcx %rax, %r14
1135 mov %r8, %rbx
1136 adcx %rdi, %r15 # %rdi is 0
1137
1138 mov \$-7, %rcx
1139 jmp .Loop_mulx_gather
1140
1141 .align 32
1142 .Loop_mulx_gather:
1143 mulx ($ap), %rax, %r8
1144 adcx %rax, %rbx
1145 adox %r9, %r8
1146
1147 mulx 8($ap), %rax, %r9
1148 .byte 0x66,0x0f,0x6e,0xa5,0x00,0x00,0x00,0x00 # movd (%rbp), %xmm4
1149 adcx %rax, %r8
1150 adox %r10, %r9
1151
1152 mulx 16($ap), %rax, %r10
1153 movd 64(%rbp), %xmm5
1154 lea 128(%rbp), %rbp
1155 adcx %rax, %r9
1156 adox %r11, %r10
1157
1158 .byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 # mulx 24($ap), %rax, %r11
1159 pslldq \$4, %xmm5
1160 por %xmm5, %xmm4
1161 adcx %rax, %r10
1162 adox %r12, %r11
1163
1164 mulx 32($ap), %rax, %r12
1165 adcx %rax, %r11
1166 adox %r13, %r12
1167
1168 mulx 40($ap), %rax, %r13
1169 adcx %rax, %r12
1170 adox %r14, %r13
1171
1172 .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
1173 adcx %rax, %r13
1174 adox %r15, %r14
1175
1176 mulx 56($ap), %rax, %r15
1177 movq %xmm4, %rdx
1178 mov %rbx, 64(%rsp,%rcx,8)
1179 adcx %rax, %r14
1180 adox %rdi, %r15
1181 mov %r8, %rbx
1182 adcx %rdi, %r15 # cf=0
1183
1184 inc %rcx # of=0
1185 jnz .Loop_mulx_gather
1186
1187 mov %r8, 64(%rsp)
1188 mov %r9, 64+8(%rsp)
1189 mov %r10, 64+16(%rsp)
1190 mov %r11, 64+24(%rsp)
1191 mov %r12, 64+32(%rsp)
1192 mov %r13, 64+40(%rsp)
1193 mov %r14, 64+48(%rsp)
1194 mov %r15, 64+56(%rsp)
1195
1196 movq %xmm0, $out
1197 movq %xmm1, %rbp
1198
1199 mov 128(%rsp), %rdx # pull $n0
1200 mov (%rsp), %r8
1201 mov 8(%rsp), %r9
1202 mov 16(%rsp), %r10
1203 mov 24(%rsp), %r11
1204 mov 32(%rsp), %r12
1205 mov 40(%rsp), %r13
1206 mov 48(%rsp), %r14
1207 mov 56(%rsp), %r15
1208
1209 call __rsaz_512_reducex
1210
1211 .Lmul_gather_tail:
1212 ___
1213 $code.=<<___;
1214 addq 64(%rsp), %r8
1215 adcq 72(%rsp), %r9
1216 adcq 80(%rsp), %r10
1217 adcq 88(%rsp), %r11
1218 adcq 96(%rsp), %r12
1219 adcq 104(%rsp), %r13
1220 adcq 112(%rsp), %r14
1221 adcq 120(%rsp), %r15
1222 sbbq %rcx, %rcx
1223
1224 call __rsaz_512_subtract
1225
1226 leaq 128+24+48(%rsp), %rax
1227 movq -48(%rax), %r15
1228 movq -40(%rax), %r14
1229 movq -32(%rax), %r13
1230 movq -24(%rax), %r12
1231 movq -16(%rax), %rbp
1232 movq -8(%rax), %rbx
1233 leaq (%rax), %rsp
1234 .Lmul_gather4_epilogue:
1235 ret
1236 .size rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
1237 ___
1238 }
1239 {
1240 my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1241 $code.=<<___;
1242 .globl rsaz_512_mul_scatter4
1243 .type rsaz_512_mul_scatter4,\@function,6
1244 .align 32
1245 rsaz_512_mul_scatter4:
1246 push %rbx
1247 push %rbp
1248 push %r12
1249 push %r13
1250 push %r14
1251 push %r15
1252
1253 mov $pwr, $pwr # zero-extend $pwr
1254 subq \$128+24, %rsp
1255 .Lmul_scatter4_body:
1256 leaq ($tbl,$pwr,4), $tbl
1257 movq $out, %xmm0 # off-load arguments
1258 movq $mod, %xmm1
1259 movq $tbl, %xmm2
1260 movq $n0, 128(%rsp)
1261
1262 movq $out, %rbp
1263 ___
1264 $code.=<<___ if ($addx);
1265 movl \$0x80100,%r11d
1266 andl OPENSSL_ia32cap_P+8(%rip),%r11d
1267 cmpl \$0x80100,%r11d # check for MULX and ADCX/ADOX
1268 je .Lmulx_scatter
1269 ___
1270 $code.=<<___;
1271 movq ($out),%rbx # pass b[0]
1272 call __rsaz_512_mul
1273
1274 movq %xmm0, $out
1275 movq %xmm1, %rbp
1276
1277 movq (%rsp), %r8
1278 movq 8(%rsp), %r9
1279 movq 16(%rsp), %r10
1280 movq 24(%rsp), %r11
1281 movq 32(%rsp), %r12
1282 movq 40(%rsp), %r13
1283 movq 48(%rsp), %r14
1284 movq 56(%rsp), %r15
1285
1286 call __rsaz_512_reduce
1287 ___
1288 $code.=<<___ if ($addx);
1289 jmp .Lmul_scatter_tail
1290
1291 .align 32
1292 .Lmulx_scatter:
1293 movq ($out), %rdx # pass b[0]
1294 call __rsaz_512_mulx
1295
1296 movq %xmm0, $out
1297 movq %xmm1, %rbp
1298
1299 movq 128(%rsp), %rdx # pull $n0
1300 movq (%rsp), %r8
1301 movq 8(%rsp), %r9
1302 movq 16(%rsp), %r10
1303 movq 24(%rsp), %r11
1304 movq 32(%rsp), %r12
1305 movq 40(%rsp), %r13
1306 movq 48(%rsp), %r14
1307 movq 56(%rsp), %r15
1308
1309 call __rsaz_512_reducex
1310
1311 .Lmul_scatter_tail:
1312 ___
1313 $code.=<<___;
1314 addq 64(%rsp), %r8
1315 adcq 72(%rsp), %r9
1316 adcq 80(%rsp), %r10
1317 adcq 88(%rsp), %r11
1318 adcq 96(%rsp), %r12
1319 adcq 104(%rsp), %r13
1320 adcq 112(%rsp), %r14
1321 adcq 120(%rsp), %r15
1322 movq %xmm2, $inp
1323 sbbq %rcx, %rcx
1324
1325 call __rsaz_512_subtract
1326
1327 movl %r8d, 64*0($inp) # scatter
1328 shrq \$32, %r8
1329 movl %r9d, 64*2($inp)
1330 shrq \$32, %r9
1331 movl %r10d, 64*4($inp)
1332 shrq \$32, %r10
1333 movl %r11d, 64*6($inp)
1334 shrq \$32, %r11
1335 movl %r12d, 64*8($inp)
1336 shrq \$32, %r12
1337 movl %r13d, 64*10($inp)
1338 shrq \$32, %r13
1339 movl %r14d, 64*12($inp)
1340 shrq \$32, %r14
1341 movl %r15d, 64*14($inp)
1342 shrq \$32, %r15
1343 movl %r8d, 64*1($inp)
1344 movl %r9d, 64*3($inp)
1345 movl %r10d, 64*5($inp)
1346 movl %r11d, 64*7($inp)
1347 movl %r12d, 64*9($inp)
1348 movl %r13d, 64*11($inp)
1349 movl %r14d, 64*13($inp)
1350 movl %r15d, 64*15($inp)
1351
1352 leaq 128+24+48(%rsp), %rax
1353 movq -48(%rax), %r15
1354 movq -40(%rax), %r14
1355 movq -32(%rax), %r13
1356 movq -24(%rax), %r12
1357 movq -16(%rax), %rbp
1358 movq -8(%rax), %rbx
1359 leaq (%rax), %rsp
1360 .Lmul_scatter4_epilogue:
1361 ret
1362 .size rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
1363 ___
1364 }
1365 {
1366 my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx");
1367 $code.=<<___;
1368 .globl rsaz_512_mul_by_one
1369 .type rsaz_512_mul_by_one,\@function,4
1370 .align 32
1371 rsaz_512_mul_by_one:
1372 push %rbx
1373 push %rbp
1374 push %r12
1375 push %r13
1376 push %r14
1377 push %r15
1378
1379 subq \$128+24, %rsp
1380 .Lmul_by_one_body:
1381 ___
1382 $code.=<<___ if ($addx);
1383 movl OPENSSL_ia32cap_P+8(%rip),%eax
1384 ___
1385 $code.=<<___;
1386 movq $mod, %rbp # reassign argument
1387 movq $n0, 128(%rsp)
1388
1389 movq ($inp), %r8
1390 pxor %xmm0, %xmm0
1391 movq 8($inp), %r9
1392 movq 16($inp), %r10
1393 movq 24($inp), %r11
1394 movq 32($inp), %r12
1395 movq 40($inp), %r13
1396 movq 48($inp), %r14
1397 movq 56($inp), %r15
1398
1399 movdqa %xmm0, (%rsp)
1400 movdqa %xmm0, 16(%rsp)
1401 movdqa %xmm0, 32(%rsp)
1402 movdqa %xmm0, 48(%rsp)
1403 movdqa %xmm0, 64(%rsp)
1404 movdqa %xmm0, 80(%rsp)
1405 movdqa %xmm0, 96(%rsp)
1406 ___
1407 $code.=<<___ if ($addx);
1408 andl \$0x80100,%eax
1409 cmpl \$0x80100,%eax # check for MULX and ADCX/ADOX
1410 je .Lby_one_callx
1411 ___
1412 $code.=<<___;
1413 call __rsaz_512_reduce
1414 ___
1415 $code.=<<___ if ($addx);
1416 jmp .Lby_one_tail
1417 .align 32
1418 .Lby_one_callx:
1419 movq 128(%rsp), %rdx # pull $n0
1420 call __rsaz_512_reducex
1421 .Lby_one_tail:
1422 ___
1423 $code.=<<___;
1424 movq %r8, ($out)
1425 movq %r9, 8($out)
1426 movq %r10, 16($out)
1427 movq %r11, 24($out)
1428 movq %r12, 32($out)
1429 movq %r13, 40($out)
1430 movq %r14, 48($out)
1431 movq %r15, 56($out)
1432
1433 leaq 128+24+48(%rsp), %rax
1434 movq -48(%rax), %r15
1435 movq -40(%rax), %r14
1436 movq -32(%rax), %r13
1437 movq -24(%rax), %r12
1438 movq -16(%rax), %rbp
1439 movq -8(%rax), %rbx
1440 leaq (%rax), %rsp
1441 .Lmul_by_one_epilogue:
1442 ret
1443 .size rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
1444 ___
1445 }
1446 { # __rsaz_512_reduce
1447 #
1448 # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
1449 # output: %r8-%r15
1450 # clobbers: everything except %rbp and %rdi
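#
# A rough word-level model of the eight reduction rounds below (a sketch
# for orientation only, not the exact register/carry flow; n0 is the
# Montgomery constant -mod^-1 mod 2^64):
#
#	for (i = 0; i < 8; i++) {
#		m = r[0]*n0;			/* mod 2^64 */
#		carry = 0;
#		for (j = 0; j < 8; j++) {
#			t = (uint128_t)m*mod[j] + r[j] + carry;
#			r[j] = (uint64_t)t;	/* r[0] becomes zero */
#			carry = (uint64_t)(t>>64);
#		}
#		/* drop the zeroed r[0], slide the window down one word, */
#		/* append carry as the new top word */
#	}
#
# The caller then adds the upper eight words of the original product
# (64(%rsp)..120(%rsp)) and derives the mask for __rsaz_512_subtract from
# the final carry (sbbq %rcx,%rcx).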
1451 $code.=<<___;
1452 .type __rsaz_512_reduce,\@abi-omnipotent
1453 .align 32
1454 __rsaz_512_reduce:
1455 movq %r8, %rbx
1456 imulq 128+8(%rsp), %rbx
1457 movq 0(%rbp), %rax
1458 movl \$8, %ecx
1459 jmp .Lreduction_loop
1460
1461 .align 32
1462 .Lreduction_loop:
1463 mulq %rbx
1464 movq 8(%rbp), %rax
1465 negq %r8
1466 movq %rdx, %r8
1467 adcq \$0, %r8
1468
1469 mulq %rbx
1470 addq %rax, %r9
1471 movq 16(%rbp), %rax
1472 adcq \$0, %rdx
1473 addq %r9, %r8
1474 movq %rdx, %r9
1475 adcq \$0, %r9
1476
1477 mulq %rbx
1478 addq %rax, %r10
1479 movq 24(%rbp), %rax
1480 adcq \$0, %rdx
1481 addq %r10, %r9
1482 movq %rdx, %r10
1483 adcq \$0, %r10
1484
1485 mulq %rbx
1486 addq %rax, %r11
1487 movq 32(%rbp), %rax
1488 adcq \$0, %rdx
1489 addq %r11, %r10
1490 movq 128+8(%rsp), %rsi
1491 #movq %rdx, %r11
1492 #adcq \$0, %r11
1493 adcq \$0, %rdx
1494 movq %rdx, %r11
1495
1496 mulq %rbx
1497 addq %rax, %r12
1498 movq 40(%rbp), %rax
1499 adcq \$0, %rdx
1500 imulq %r8, %rsi
1501 addq %r12, %r11
1502 movq %rdx, %r12
1503 adcq \$0, %r12
1504
1505 mulq %rbx
1506 addq %rax, %r13
1507 movq 48(%rbp), %rax
1508 adcq \$0, %rdx
1509 addq %r13, %r12
1510 movq %rdx, %r13
1511 adcq \$0, %r13
1512
1513 mulq %rbx
1514 addq %rax, %r14
1515 movq 56(%rbp), %rax
1516 adcq \$0, %rdx
1517 addq %r14, %r13
1518 movq %rdx, %r14
1519 adcq \$0, %r14
1520
1521 mulq %rbx
1522 movq %rsi, %rbx
1523 addq %rax, %r15
1524 movq 0(%rbp), %rax
1525 adcq \$0, %rdx
1526 addq %r15, %r14
1527 movq %rdx, %r15
1528 adcq \$0, %r15
1529
1530 decl %ecx
1531 jne .Lreduction_loop
1532
1533 ret
1534 .size __rsaz_512_reduce,.-__rsaz_512_reduce
1535 ___
1536 }
1537 if ($addx) {
1538 # __rsaz_512_reducex
1539 #
1540 # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
1541 # output: %r8-%r15
1542 # clobbers: everything except %rbp and %rdi
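#
# Same reduction as __rsaz_512_reduce above, but using MULX together with
# the two independent ADCX/ADOX carry chains; note that the caller
# pre-loads n0 into %rdx (hence the commented-out load below).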
1543 $code.=<<___;
1544 .type __rsaz_512_reducex,\@abi-omnipotent
1545 .align 32
1546 __rsaz_512_reducex:
1547 #movq 128+8(%rsp), %rdx # pull $n0
1548 imulq %r8, %rdx
1549 xorq %rsi, %rsi # cf=0,of=0
1550 movl \$8, %ecx
1551 jmp .Lreduction_loopx
1552
1553 .align 32
1554 .Lreduction_loopx:
1555 mov %r8, %rbx
1556 mulx 0(%rbp), %rax, %r8
1557 adcx %rbx, %rax
1558 adox %r9, %r8
1559
1560 mulx 8(%rbp), %rax, %r9
1561 adcx %rax, %r8
1562 adox %r10, %r9
1563
1564 mulx 16(%rbp), %rbx, %r10
1565 adcx %rbx, %r9
1566 adox %r11, %r10
1567
1568 mulx 24(%rbp), %rbx, %r11
1569 adcx %rbx, %r10
1570 adox %r12, %r11
1571
1572 .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 32(%rbp), %rbx, %r12
1573 mov %rdx, %rax
1574 mov %r8, %rdx
1575 adcx %rbx, %r11
1576 adox %r13, %r12
1577
1578 mulx 128+8(%rsp), %rbx, %rdx
1579 mov %rax, %rdx
1580
1581 mulx 40(%rbp), %rax, %r13
1582 adcx %rax, %r12
1583 adox %r14, %r13
1584
1585 .byte 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00 # mulx 48(%rbp), %rax, %r14
1586 adcx %rax, %r13
1587 adox %r15, %r14
1588
1589 mulx 56(%rbp), %rax, %r15
1590 mov %rbx, %rdx
1591 adcx %rax, %r14
1592 adox %rsi, %r15 # %rsi is 0
1593 adcx %rsi, %r15 # cf=0
1594
1595 decl %ecx # of=0
1596 jne .Lreduction_loopx
1597
1598 ret
1599 .size __rsaz_512_reducex,.-__rsaz_512_reducex
1600 ___
1601 }
1602 { # __rsaz_512_subtract
1603 # input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask
1604 # output:
1605 # clobbers: everything but %rdi, %rsi and %rbp
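#
# In effect (a sketch): the eight result words are stored to $out, then
# (-mod & mask) is formed word by word (negq of the least significant
# word, notq of the rest) and added back in, i.e. when the mask is all
# ones the modulus is subtracted modulo 2^512, and when the mask is zero
# the result is left unchanged.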
1606 $code.=<<___;
1607 .type __rsaz_512_subtract,\@abi-omnipotent
1608 .align 32
1609 __rsaz_512_subtract:
1610 movq %r8, ($out)
1611 movq %r9, 8($out)
1612 movq %r10, 16($out)
1613 movq %r11, 24($out)
1614 movq %r12, 32($out)
1615 movq %r13, 40($out)
1616 movq %r14, 48($out)
1617 movq %r15, 56($out)
1618
1619 movq 0($mod), %r8
1620 movq 8($mod), %r9
1621 negq %r8
1622 notq %r9
1623 andq %rcx, %r8
1624 movq 16($mod), %r10
1625 andq %rcx, %r9
1626 notq %r10
1627 movq 24($mod), %r11
1628 andq %rcx, %r10
1629 notq %r11
1630 movq 32($mod), %r12
1631 andq %rcx, %r11
1632 notq %r12
1633 movq 40($mod), %r13
1634 andq %rcx, %r12
1635 notq %r13
1636 movq 48($mod), %r14
1637 andq %rcx, %r13
1638 notq %r14
1639 movq 56($mod), %r15
1640 andq %rcx, %r14
1641 notq %r15
1642 andq %rcx, %r15
1643
1644 addq ($out), %r8
1645 adcq 8($out), %r9
1646 adcq 16($out), %r10
1647 adcq 24($out), %r11
1648 adcq 32($out), %r12
1649 adcq 40($out), %r13
1650 adcq 48($out), %r14
1651 adcq 56($out), %r15
1652
1653 movq %r8, ($out)
1654 movq %r9, 8($out)
1655 movq %r10, 16($out)
1656 movq %r11, 24($out)
1657 movq %r12, 32($out)
1658 movq %r13, 40($out)
1659 movq %r14, 48($out)
1660 movq %r15, 56($out)
1661
1662 ret
1663 .size __rsaz_512_subtract,.-__rsaz_512_subtract
1664 ___
1665 }
1666 { # __rsaz_512_mul
1667 #
1668 # input: %rsi - ap, %rbp - bp
1669 # output:
1670 # clobbers: everything
1671 my ($ap,$bp) = ("%rsi","%rbp");
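#
# Operand-scanning schoolbook multiplication; the 1024-bit product is
# accumulated into a 16-word scratch area on the stack (picked up by the
# caller at (%rsp)). Roughly, as a sketch (%rbx is preloaded with b[0] by
# the caller and reloaded from ($bp) on every subsequent pass):
#
#	for (i = 0; i < 8; i++) {
#		carry = 0;
#		for (j = 0; j < 8; j++) {
#			t = (uint128_t)a[j]*b[i] + tmp[i+j] + carry;
#			tmp[i+j] = (uint64_t)t;
#			carry = (uint64_t)(t>>64);
#		}
#		tmp[i+8] = carry;
#	}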
1672 $code.=<<___;
1673 .type __rsaz_512_mul,\@abi-omnipotent
1674 .align 32
1675 __rsaz_512_mul:
1676 leaq 8(%rsp), %rdi
1677
1678 movq ($ap), %rax
1679 mulq %rbx
1680 movq %rax, (%rdi)
1681 movq 8($ap), %rax
1682 movq %rdx, %r8
1683
1684 mulq %rbx
1685 addq %rax, %r8
1686 movq 16($ap), %rax
1687 movq %rdx, %r9
1688 adcq \$0, %r9
1689
1690 mulq %rbx
1691 addq %rax, %r9
1692 movq 24($ap), %rax
1693 movq %rdx, %r10
1694 adcq \$0, %r10
1695
1696 mulq %rbx
1697 addq %rax, %r10
1698 movq 32($ap), %rax
1699 movq %rdx, %r11
1700 adcq \$0, %r11
1701
1702 mulq %rbx
1703 addq %rax, %r11
1704 movq 40($ap), %rax
1705 movq %rdx, %r12
1706 adcq \$0, %r12
1707
1708 mulq %rbx
1709 addq %rax, %r12
1710 movq 48($ap), %rax
1711 movq %rdx, %r13
1712 adcq \$0, %r13
1713
1714 mulq %rbx
1715 addq %rax, %r13
1716 movq 56($ap), %rax
1717 movq %rdx, %r14
1718 adcq \$0, %r14
1719
1720 mulq %rbx
1721 addq %rax, %r14
1722 movq ($ap), %rax
1723 movq %rdx, %r15
1724 adcq \$0, %r15
1725
1726 leaq 8($bp), $bp
1727 leaq 8(%rdi), %rdi
1728
1729 movl \$7, %ecx
1730 jmp .Loop_mul
1731
1732 .align 32
1733 .Loop_mul:
1734 movq ($bp), %rbx
1735 mulq %rbx
1736 addq %rax, %r8
1737 movq 8($ap), %rax
1738 movq %r8, (%rdi)
1739 movq %rdx, %r8
1740 adcq \$0, %r8
1741
1742 mulq %rbx
1743 addq %rax, %r9
1744 movq 16($ap), %rax
1745 adcq \$0, %rdx
1746 addq %r9, %r8
1747 movq %rdx, %r9
1748 adcq \$0, %r9
1749
1750 mulq %rbx
1751 addq %rax, %r10
1752 movq 24($ap), %rax
1753 adcq \$0, %rdx
1754 addq %r10, %r9
1755 movq %rdx, %r10
1756 adcq \$0, %r10
1757
1758 mulq %rbx
1759 addq %rax, %r11
1760 movq 32($ap), %rax
1761 adcq \$0, %rdx
1762 addq %r11, %r10
1763 movq %rdx, %r11
1764 adcq \$0, %r11
1765
1766 mulq %rbx
1767 addq %rax, %r12
1768 movq 40($ap), %rax
1769 adcq \$0, %rdx
1770 addq %r12, %r11
1771 movq %rdx, %r12
1772 adcq \$0, %r12
1773
1774 mulq %rbx
1775 addq %rax, %r13
1776 movq 48($ap), %rax
1777 adcq \$0, %rdx
1778 addq %r13, %r12
1779 movq %rdx, %r13
1780 adcq \$0, %r13
1781
1782 mulq %rbx
1783 addq %rax, %r14
1784 movq 56($ap), %rax
1785 adcq \$0, %rdx
1786 addq %r14, %r13
1787 movq %rdx, %r14
1788 leaq 8($bp), $bp
1789 adcq \$0, %r14
1790
1791 mulq %rbx
1792 addq %rax, %r15
1793 movq ($ap), %rax
1794 adcq \$0, %rdx
1795 addq %r15, %r14
1796 movq %rdx, %r15
1797 adcq \$0, %r15
1798
1799 leaq 8(%rdi), %rdi
1800
1801 decl %ecx
1802 jnz .Loop_mul
1803
1804 movq %r8, (%rdi)
1805 movq %r9, 8(%rdi)
1806 movq %r10, 16(%rdi)
1807 movq %r11, 24(%rdi)
1808 movq %r12, 32(%rdi)
1809 movq %r13, 40(%rdi)
1810 movq %r14, 48(%rdi)
1811 movq %r15, 56(%rdi)
1812
1813 ret
1814 .size __rsaz_512_mul,.-__rsaz_512_mul
1815 ___
1816 }
1817 if ($addx) {
1818 # __rsaz_512_mulx
1819 #
1820 # input: %rsi - ap, %rbp - bp
1821 # output:
1822 # clobbers: everything
1823 my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi");
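#
# Same schoolbook product as __rsaz_512_mul above, but with MULX (which
# leaves the flags untouched) and the two independent ADCX/ADOX carry
# chains, so the low and high halves of the partial products can be
# accumulated in a single interleaved pass per word of b[].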
1824 $code.=<<___;
1825 .type __rsaz_512_mulx,\@abi-omnipotent
1826 .align 32
1827 __rsaz_512_mulx:
1828 mulx ($ap), %rbx, %r8 # initial %rdx preloaded by caller
1829 mov \$-6, %rcx
1830
1831 mulx 8($ap), %rax, %r9
1832 movq %rbx, 8(%rsp)
1833
1834 mulx 16($ap), %rbx, %r10
1835 adc %rax, %r8
1836
1837 mulx 24($ap), %rax, %r11
1838 adc %rbx, %r9
1839
1840 mulx 32($ap), %rbx, %r12
1841 adc %rax, %r10
1842
1843 mulx 40($ap), %rax, %r13
1844 adc %rbx, %r11
1845
1846 mulx 48($ap), %rbx, %r14
1847 adc %rax, %r12
1848
1849 mulx 56($ap), %rax, %r15
1850 mov 8($bp), %rdx
1851 adc %rbx, %r13
1852 adc %rax, %r14
1853 adc \$0, %r15
1854
1855 xor $zero, $zero # cf=0,of=0
1856 jmp .Loop_mulx
1857
1858 .align 32
1859 .Loop_mulx:
1860 movq %r8, %rbx
1861 mulx ($ap), %rax, %r8
1862 adcx %rax, %rbx
1863 adox %r9, %r8
1864
1865 mulx 8($ap), %rax, %r9
1866 adcx %rax, %r8
1867 adox %r10, %r9
1868
1869 mulx 16($ap), %rax, %r10
1870 adcx %rax, %r9
1871 adox %r11, %r10
1872
1873 mulx 24($ap), %rax, %r11
1874 adcx %rax, %r10
1875 adox %r12, %r11
1876
1877 .byte 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00 # mulx 32($ap), %rax, %r12
1878 adcx %rax, %r11
1879 adox %r13, %r12
1880
1881 mulx 40($ap), %rax, %r13
1882 adcx %rax, %r12
1883 adox %r14, %r13
1884
1885 mulx 48($ap), %rax, %r14
1886 adcx %rax, %r13
1887 adox %r15, %r14
1888
1889 mulx 56($ap), %rax, %r15
1890 movq 64($bp,%rcx,8), %rdx
1891 movq %rbx, 8+64-8(%rsp,%rcx,8)
1892 adcx %rax, %r14
1893 adox $zero, %r15
1894 adcx $zero, %r15 # cf=0
1895
1896 inc %rcx # of=0
1897 jnz .Loop_mulx
1898
1899 movq %r8, %rbx
1900 mulx ($ap), %rax, %r8
1901 adcx %rax, %rbx
1902 adox %r9, %r8
1903
1904 .byte 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00 # mulx 8($ap), %rax, %r9
1905 adcx %rax, %r8
1906 adox %r10, %r9
1907
1908 .byte 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00 # mulx 16($ap), %rax, %r10
1909 adcx %rax, %r9
1910 adox %r11, %r10
1911
1912 mulx 24($ap), %rax, %r11
1913 adcx %rax, %r10
1914 adox %r12, %r11
1915
1916 mulx 32($ap), %rax, %r12
1917 adcx %rax, %r11
1918 adox %r13, %r12
1919
1920 mulx 40($ap), %rax, %r13
1921 adcx %rax, %r12
1922 adox %r14, %r13
1923
1924 .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
1925 adcx %rax, %r13
1926 adox %r15, %r14
1927
1928 .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($ap), %rax, %r15
1929 adcx %rax, %r14
1930 adox $zero, %r15
1931 adcx $zero, %r15
1932
1933 mov %rbx, 8+64-8(%rsp)
1934 mov %r8, 8+64(%rsp)
1935 mov %r9, 8+64+8(%rsp)
1936 mov %r10, 8+64+16(%rsp)
1937 mov %r11, 8+64+24(%rsp)
1938 mov %r12, 8+64+32(%rsp)
1939 mov %r13, 8+64+40(%rsp)
1940 mov %r14, 8+64+48(%rsp)
1941 mov %r15, 8+64+56(%rsp)
1942
1943 ret
1944 .size __rsaz_512_mulx,.-__rsaz_512_mulx
1945 ___
1946 }
1947 {
1948 my ($out,$inp,$power)= $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
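#
# Table layout assumed by the scatter/gather routines (and by the
# mul_gather4/mul_scatter4 addressing above): 16 precomputed 512-bit
# values stored interleaved 32 bits at a time, so that dword j of entry
# "power" lives at byte offset j*64 + power*4.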
1949 $code.=<<___;
1950 .globl rsaz_512_scatter4
1951 .type rsaz_512_scatter4,\@abi-omnipotent
1952 .align 16
1953 rsaz_512_scatter4:
1954 leaq ($out,$power,4), $out
1955 movl \$8, %r9d
1956 jmp .Loop_scatter
1957 .align 16
1958 .Loop_scatter:
1959 movq ($inp), %rax
1960 leaq 8($inp), $inp
1961 movl %eax, ($out)
1962 shrq \$32, %rax
1963 movl %eax, 64($out)
1964 leaq 128($out), $out
1965 decl %r9d
1966 jnz .Loop_scatter
1967 ret
1968 .size rsaz_512_scatter4,.-rsaz_512_scatter4
1969
1970 .globl rsaz_512_gather4
1971 .type rsaz_512_gather4,\@abi-omnipotent
1972 .align 16
1973 rsaz_512_gather4:
1974 leaq ($inp,$power,4), $inp
1975 movl \$8, %r9d
1976 jmp .Loop_gather
1977 .align 16
1978 .Loop_gather:
1979 movl ($inp), %eax
1980 movl 64($inp), %r8d
1981 leaq 128($inp), $inp
1982 shlq \$32, %r8
1983 or %r8, %rax
1984 movq %rax, ($out)
1985 leaq 8($out), $out
1986 decl %r9d
1987 jnz .Loop_gather
1988 ret
1989 .size rsaz_512_gather4,.-rsaz_512_gather4
1990 ___
1991 }
1992
1993 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1994 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
1995 if ($win64) {
1996 $rec="%rcx";
1997 $frame="%rdx";
1998 $context="%r8";
1999 $disp="%r9";
2000
2001 $code.=<<___;
2002 .extern __imp_RtlVirtualUnwind
2003 .type se_handler,\@abi-omnipotent
2004 .align 16
2005 se_handler:
2006 push %rsi
2007 push %rdi
2008 push %rbx
2009 push %rbp
2010 push %r12
2011 push %r13
2012 push %r14
2013 push %r15
2014 pushfq
2015 sub \$64,%rsp
2016
2017 mov 120($context),%rax # pull context->Rax
2018 mov 248($context),%rbx # pull context->Rip
2019
2020 mov 8($disp),%rsi # disp->ImageBase
2021 mov 56($disp),%r11 # disp->HandlerData
2022
2023 mov 0(%r11),%r10d # HandlerData[0]
2024 lea (%rsi,%r10),%r10 # end of prologue label
2025 cmp %r10,%rbx # context->Rip<end of prologue label
2026 jb .Lcommon_seh_tail
2027
2028 mov 152($context),%rax # pull context->Rsp
2029
2030 mov 4(%r11),%r10d # HandlerData[1]
2031 lea (%rsi,%r10),%r10 # epilogue label
2032 cmp %r10,%rbx # context->Rip>=epilogue label
2033 jae .Lcommon_seh_tail
2034
2035 lea 128+24+48(%rax),%rax
2036
2037 mov -8(%rax),%rbx
2038 mov -16(%rax),%rbp
2039 mov -24(%rax),%r12
2040 mov -32(%rax),%r13
2041 mov -40(%rax),%r14
2042 mov -48(%rax),%r15
2043 mov %rbx,144($context) # restore context->Rbx
2044 mov %rbp,160($context) # restore context->Rbp
2045 mov %r12,216($context) # restore context->R12
2046 mov %r13,224($context) # restore context->R13
2047 mov %r14,232($context) # restore context->R14
2048 mov %r15,240($context) # restore context->R15
2049
2050 .Lcommon_seh_tail:
2051 mov 8(%rax),%rdi
2052 mov 16(%rax),%rsi
2053 mov %rax,152($context) # restore context->Rsp
2054 mov %rsi,168($context) # restore context->Rsi
2055 mov %rdi,176($context) # restore context->Rdi
2056
2057 mov 40($disp),%rdi # disp->ContextRecord
2058 mov $context,%rsi # context
2059 mov \$154,%ecx # sizeof(CONTEXT)
2060 .long 0xa548f3fc # cld; rep movsq
2061
2062 mov $disp,%rsi
2063 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
2064 mov 8(%rsi),%rdx # arg2, disp->ImageBase
2065 mov 0(%rsi),%r8 # arg3, disp->ControlPc
2066 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
2067 mov 40(%rsi),%r10 # disp->ContextRecord
2068 lea 56(%rsi),%r11 # &disp->HandlerData
2069 lea 24(%rsi),%r12 # &disp->EstablisherFrame
2070 mov %r10,32(%rsp) # arg5
2071 mov %r11,40(%rsp) # arg6
2072 mov %r12,48(%rsp) # arg7
2073 mov %rcx,56(%rsp) # arg8, (NULL)
2074 call *__imp_RtlVirtualUnwind(%rip)
2075
2076 mov \$1,%eax # ExceptionContinueSearch
2077 add \$64,%rsp
2078 popfq
2079 pop %r15
2080 pop %r14
2081 pop %r13
2082 pop %r12
2083 pop %rbp
2084 pop %rbx
2085 pop %rdi
2086 pop %rsi
2087 ret
2088 .size se_handler,.-se_handler
2089
2090 .section .pdata
2091 .align 4
2092 .rva .LSEH_begin_rsaz_512_sqr
2093 .rva .LSEH_end_rsaz_512_sqr
2094 .rva .LSEH_info_rsaz_512_sqr
2095
2096 .rva .LSEH_begin_rsaz_512_mul
2097 .rva .LSEH_end_rsaz_512_mul
2098 .rva .LSEH_info_rsaz_512_mul
2099
2100 .rva .LSEH_begin_rsaz_512_mul_gather4
2101 .rva .LSEH_end_rsaz_512_mul_gather4
2102 .rva .LSEH_info_rsaz_512_mul_gather4
2103
2104 .rva .LSEH_begin_rsaz_512_mul_scatter4
2105 .rva .LSEH_end_rsaz_512_mul_scatter4
2106 .rva .LSEH_info_rsaz_512_mul_scatter4
2107
2108 .rva .LSEH_begin_rsaz_512_mul_by_one
2109 .rva .LSEH_end_rsaz_512_mul_by_one
2110 .rva .LSEH_info_rsaz_512_mul_by_one
2111
2112 .section .xdata
2113 .align 8
2114 .LSEH_info_rsaz_512_sqr:
2115 .byte 9,0,0,0
2116 .rva se_handler
2117 .rva .Lsqr_body,.Lsqr_epilogue # HandlerData[]
2118 .LSEH_info_rsaz_512_mul:
2119 .byte 9,0,0,0
2120 .rva se_handler
2121 .rva .Lmul_body,.Lmul_epilogue # HandlerData[]
2122 .LSEH_info_rsaz_512_mul_gather4:
2123 .byte 9,0,0,0
2124 .rva se_handler
2125 .rva .Lmul_gather4_body,.Lmul_gather4_epilogue # HandlerData[]
2126 .LSEH_info_rsaz_512_mul_scatter4:
2127 .byte 9,0,0,0
2128 .rva se_handler
2129 .rva .Lmul_scatter4_body,.Lmul_scatter4_epilogue # HandlerData[]
2130 .LSEH_info_rsaz_512_mul_by_one:
2131 .byte 9,0,0,0
2132 .rva se_handler
2133 .rva .Lmul_by_one_body,.Lmul_by_one_epilogue # HandlerData[]
2134 ___
2135 }
2136
2137 $code =~ s/\`([^\`]*)\`/eval $1/gem;
2138 print $code;
2139 close STDOUT;