1 #!/usr/bin/env perl
2
3 ##############################################################################
4 # #
5 # Copyright (c) 2012, Intel Corporation #
6 # #
7 # All rights reserved. #
8 # #
9 # Redistribution and use in source and binary forms, with or without #
10 # modification, are permitted provided that the following conditions are #
11 # met: #
12 # #
13 # * Redistributions of source code must retain the above copyright #
14 # notice, this list of conditions and the following disclaimer. #
15 # #
16 # * Redistributions in binary form must reproduce the above copyright #
17 # notice, this list of conditions and the following disclaimer in the #
18 # documentation and/or other materials provided with the #
19 # distribution. #
20 # #
21 # * Neither the name of the Intel Corporation nor the names of its #
22 # contributors may be used to endorse or promote products derived from #
23 # this software without specific prior written permission. #
24 # #
25 # #
26 # THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY #
27 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE #
28 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR #
29 # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR #
30 # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, #
31 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, #
32 # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR #
33 # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF #
34 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING #
35 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS #
36 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #
37 # #
38 ##############################################################################
39 # Developers and authors: #
40 # Shay Gueron (1, 2), and Vlad Krasnov (1) #
41 # (1) Intel Architecture Group, Microprocessor and Chipset Development, #
42 # Israel Development Center, Haifa, Israel #
43 # (2) University of Haifa #
44 ##############################################################################
45 # Reference: #
46 # [1] S. Gueron, "Efficient Software Implementations of Modular #
47 # Exponentiation", http://eprint.iacr.org/2011/239 #
48 # [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring". #
49 # IEEE Proceedings of 9th International Conference on Information #
50 # Technology: New Generations (ITNG 2012), 821-823 (2012). #
51 # [3] S. Gueron, Efficient Software Implementations of Modular Exponentiation#
52 # Journal of Cryptographic Engineering 2:31-43 (2012). #
53 # [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis #
54 # resistant 512-bit and 1024-bit modular exponentiation for optimizing #
55 # RSA1024 and RSA2048 on x86_64 platforms", #
56 # http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest#
57 ##############################################################################
58
59 # While the original submission covers both 512- and 1024-bit exponentiation,
60 # this module is limited to the 512-bit version only (and as such it
61 # accelerates RSA1024 signing). This is because the improvement for longer
62 # keys is not high enough to justify the effort; the highest measured gain
63 # was ~5% on Westmere. [This is relative to OpenSSL 1.0.2, an upcoming
64 # release at the time of this writing.] Nor does this module implement a
65 # "monolithic" all-in-one exponentiation subroutine; it adheres to a more
66 # modular mixture of C and assembly, and it is optimized even for processors
67 # other than the Intel Core family (see the table below for improvement
68 # coefficients, and the note after it for the Montgomery conventions in use).
69 # <appro@openssl.org>
70 #
71 # RSA1024 sign/sec   this/original   |this/rsax(*)   this/fips(*)
72 # -----------------------------------+------------------------------
73 # Opteron            +13%            |+5%            +20%
74 # Bulldozer           -0%            |-1%            +10%
75 # P4                 +11%            |+7%             +8%
76 # Westmere            +5%            |+14%           +17%
77 # Sandy Bridge        +2%            |+12%           +29%
78 # Ivy Bridge          +1%            |+11%           +35%
79 # Haswell(**)         -0%            |+12%           +39%
80 # Atom               +13%            |+11%            +4%
81 # VIA Nano           +70%            |+9%            +25%
82 #
83 # (*) rsax engine and fips numbers are presented for reference
84 # purposes;
85 # (**) MULX was attempted, but found to give only marginal improvement;
86
87 $flavour = shift;
88 $output = shift;
89 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
90
91 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
92
93 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
94 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
95 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
96 die "can't locate x86_64-xlate.pl";
97
98 open OUT,"| $^X $xlate $flavour $output";
99 *STDOUT=*OUT;
100
101 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
102 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
103 $addx = ($1>=2.23);
104 }
105
106 if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
107 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
108 $addx = ($1>=2.10);
109 }
110
111 if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
112 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
113 $addx = ($1>=11);
114 }
115
116 ($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp"); # common internal API
117 {
118 my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d");
119
120 $code.=<<___;
121 .text
122
123 .extern OPENSSL_ia32cap_P
124
125 .globl rsaz_512_sqr
126 .type rsaz_512_sqr,\@function,5
127 .align 32
128 rsaz_512_sqr: # 25-29% faster than rsaz_512_mul
129 push %rbx
130 push %rbp
131 push %r12
132 push %r13
133 push %r14
134 push %r15
135
136 subq \$128+24, %rsp
137 .Lsqr_body:
138 movq $mod, %rbp # common argument
139 movq ($inp), %rdx
140 movq 8($inp), %rax
141 movq $n0, 128(%rsp)
142 ___
143 $code.=<<___ if ($addx);
144 movl \$0x80100,%r11d
145 andl OPENSSL_ia32cap_P+8(%rip),%r11d
146 cmpl \$0x80100,%r11d # check for MULX and ADO/CX
147 je .Loop_sqrx
148 ___
149 $code.=<<___;
150 jmp .Loop_sqr
151
152 .align 32
153 .Loop_sqr:
154 movl $times,128+8(%rsp)
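# Squaring note: each pass below forms the cross products a[i]*a[j] (i<j)
# only once, doubles them with the add/lea/shld-style sequences, and folds
# in the diagonal squares a[i]^2; this is the textbook identity behind the
# 25-29% advantage over a full 8x8 multiply quoted above.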
155 #first iteration
156 movq %rdx, %rbx
157 mulq %rdx
158 movq %rax, %r8
159 movq 16($inp), %rax
160 movq %rdx, %r9
161
162 mulq %rbx
163 addq %rax, %r9
164 movq 24($inp), %rax
165 movq %rdx, %r10
166 adcq \$0, %r10
167
168 mulq %rbx
169 addq %rax, %r10
170 movq 32($inp), %rax
171 movq %rdx, %r11
172 adcq \$0, %r11
173
174 mulq %rbx
175 addq %rax, %r11
176 movq 40($inp), %rax
177 movq %rdx, %r12
178 adcq \$0, %r12
179
180 mulq %rbx
181 addq %rax, %r12
182 movq 48($inp), %rax
183 movq %rdx, %r13
184 adcq \$0, %r13
185
186 mulq %rbx
187 addq %rax, %r13
188 movq 56($inp), %rax
189 movq %rdx, %r14
190 adcq \$0, %r14
191
192 mulq %rbx
193 addq %rax, %r14
194 movq %rbx, %rax
195 movq %rdx, %r15
196 adcq \$0, %r15
197
198 addq %r8, %r8 #shlq \$1, %r8
199 movq %r9, %rcx
200 adcq %r9, %r9 #shld \$1, %r8, %r9
201
202 mulq %rax
203 movq %rax, (%rsp)
204 addq %rdx, %r8
205 adcq \$0, %r9
206
207 movq %r8, 8(%rsp)
208 shrq \$63, %rcx
209
210 #second iteration
211 movq 8($inp), %r8
212 movq 16($inp), %rax
213 mulq %r8
214 addq %rax, %r10
215 movq 24($inp), %rax
216 movq %rdx, %rbx
217 adcq \$0, %rbx
218
219 mulq %r8
220 addq %rax, %r11
221 movq 32($inp), %rax
222 adcq \$0, %rdx
223 addq %rbx, %r11
224 movq %rdx, %rbx
225 adcq \$0, %rbx
226
227 mulq %r8
228 addq %rax, %r12
229 movq 40($inp), %rax
230 adcq \$0, %rdx
231 addq %rbx, %r12
232 movq %rdx, %rbx
233 adcq \$0, %rbx
234
235 mulq %r8
236 addq %rax, %r13
237 movq 48($inp), %rax
238 adcq \$0, %rdx
239 addq %rbx, %r13
240 movq %rdx, %rbx
241 adcq \$0, %rbx
242
243 mulq %r8
244 addq %rax, %r14
245 movq 56($inp), %rax
246 adcq \$0, %rdx
247 addq %rbx, %r14
248 movq %rdx, %rbx
249 adcq \$0, %rbx
250
251 mulq %r8
252 addq %rax, %r15
253 movq %r8, %rax
254 adcq \$0, %rdx
255 addq %rbx, %r15
256 movq %rdx, %r8
257 movq %r10, %rdx
258 adcq \$0, %r8
259
260 add %rdx, %rdx
261 lea (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
262 movq %r11, %rbx
263 adcq %r11, %r11 #shld \$1, %r10, %r11
264
265 mulq %rax
266 addq %rax, %r9
267 adcq %rdx, %r10
268 adcq \$0, %r11
269
270 movq %r9, 16(%rsp)
271 movq %r10, 24(%rsp)
272 shrq \$63, %rbx
273
274 #third iteration
275 movq 16($inp), %r9
276 movq 24($inp), %rax
277 mulq %r9
278 addq %rax, %r12
279 movq 32($inp), %rax
280 movq %rdx, %rcx
281 adcq \$0, %rcx
282
283 mulq %r9
284 addq %rax, %r13
285 movq 40($inp), %rax
286 adcq \$0, %rdx
287 addq %rcx, %r13
288 movq %rdx, %rcx
289 adcq \$0, %rcx
290
291 mulq %r9
292 addq %rax, %r14
293 movq 48($inp), %rax
294 adcq \$0, %rdx
295 addq %rcx, %r14
296 movq %rdx, %rcx
297 adcq \$0, %rcx
298
299 mulq %r9
300 movq %r12, %r10
301 lea (%rbx,%r12,2), %r12 #shld \$1, %rbx, %r12
302 addq %rax, %r15
303 movq 56($inp), %rax
304 adcq \$0, %rdx
305 addq %rcx, %r15
306 movq %rdx, %rcx
307 adcq \$0, %rcx
308
309 mulq %r9
310 shrq \$63, %r10
311 addq %rax, %r8
312 movq %r9, %rax
313 adcq \$0, %rdx
314 addq %rcx, %r8
315 movq %rdx, %r9
316 adcq \$0, %r9
317
318 movq %r13, %rcx
319 leaq (%r10,%r13,2), %r13 #shld \$1, %r12, %r13
320
321 mulq %rax
322 addq %rax, %r11
323 adcq %rdx, %r12
324 adcq \$0, %r13
325
326 movq %r11, 32(%rsp)
327 movq %r12, 40(%rsp)
328 shrq \$63, %rcx
329
330 #fourth iteration
331 movq 24($inp), %r10
332 movq 32($inp), %rax
333 mulq %r10
334 addq %rax, %r14
335 movq 40($inp), %rax
336 movq %rdx, %rbx
337 adcq \$0, %rbx
338
339 mulq %r10
340 addq %rax, %r15
341 movq 48($inp), %rax
342 adcq \$0, %rdx
343 addq %rbx, %r15
344 movq %rdx, %rbx
345 adcq \$0, %rbx
346
347 mulq %r10
348 movq %r14, %r12
349 leaq (%rcx,%r14,2), %r14 #shld \$1, %rcx, %r14
350 addq %rax, %r8
351 movq 56($inp), %rax
352 adcq \$0, %rdx
353 addq %rbx, %r8
354 movq %rdx, %rbx
355 adcq \$0, %rbx
356
357 mulq %r10
358 shrq \$63, %r12
359 addq %rax, %r9
360 movq %r10, %rax
361 adcq \$0, %rdx
362 addq %rbx, %r9
363 movq %rdx, %r10
364 adcq \$0, %r10
365
366 movq %r15, %rbx
367 leaq (%r12,%r15,2),%r15 #shld \$1, %r14, %r15
368
369 mulq %rax
370 addq %rax, %r13
371 adcq %rdx, %r14
372 adcq \$0, %r15
373
374 movq %r13, 48(%rsp)
375 movq %r14, 56(%rsp)
376 shrq \$63, %rbx
377
378 #fifth iteration
379 movq 32($inp), %r11
380 movq 40($inp), %rax
381 mulq %r11
382 addq %rax, %r8
383 movq 48($inp), %rax
384 movq %rdx, %rcx
385 adcq \$0, %rcx
386
387 mulq %r11
388 addq %rax, %r9
389 movq 56($inp), %rax
390 adcq \$0, %rdx
391 movq %r8, %r12
392 leaq (%rbx,%r8,2), %r8 #shld \$1, %rbx, %r8
393 addq %rcx, %r9
394 movq %rdx, %rcx
395 adcq \$0, %rcx
396
397 mulq %r11
398 shrq \$63, %r12
399 addq %rax, %r10
400 movq %r11, %rax
401 adcq \$0, %rdx
402 addq %rcx, %r10
403 movq %rdx, %r11
404 adcq \$0, %r11
405
406 movq %r9, %rcx
407 leaq (%r12,%r9,2), %r9 #shld \$1, %r8, %r9
408
409 mulq %rax
410 addq %rax, %r15
411 adcq %rdx, %r8
412 adcq \$0, %r9
413
414 movq %r15, 64(%rsp)
415 movq %r8, 72(%rsp)
416 shrq \$63, %rcx
417
418 #sixth iteration
419 movq 40($inp), %r12
420 movq 48($inp), %rax
421 mulq %r12
422 addq %rax, %r10
423 movq 56($inp), %rax
424 movq %rdx, %rbx
425 adcq \$0, %rbx
426
427 mulq %r12
428 addq %rax, %r11
429 movq %r12, %rax
430 movq %r10, %r15
431 leaq (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
432 adcq \$0, %rdx
433 shrq \$63, %r15
434 addq %rbx, %r11
435 movq %rdx, %r12
436 adcq \$0, %r12
437
438 movq %r11, %rbx
439 leaq (%r15,%r11,2), %r11 #shld \$1, %r10, %r11
440
441 mulq %rax
442 addq %rax, %r9
443 adcq %rdx, %r10
444 adcq \$0, %r11
445
446 movq %r9, 80(%rsp)
447 movq %r10, 88(%rsp)
448
449 #seventh iteration
450 movq 48($inp), %r13
451 movq 56($inp), %rax
452 mulq %r13
453 addq %rax, %r12
454 movq %r13, %rax
455 movq %rdx, %r13
456 adcq \$0, %r13
457
458 xorq %r14, %r14
459 shlq \$1, %rbx
460 adcq %r12, %r12 #shld \$1, %rbx, %r12
461 adcq %r13, %r13 #shld \$1, %r12, %r13
462 adcq %r14, %r14 #shld \$1, %r13, %r14
463
464 mulq %rax
465 addq %rax, %r11
466 adcq %rdx, %r12
467 adcq \$0, %r13
468
469 movq %r11, 96(%rsp)
470 movq %r12, 104(%rsp)
471
472 #eighth iteration
473 movq 56($inp), %rax
474 mulq %rax
475 addq %rax, %r13
476 adcq \$0, %rdx
477
478 addq %rdx, %r14
479
480 movq %r13, 112(%rsp)
481 movq %r14, 120(%rsp)
482
483 movq (%rsp), %r8
484 movq 8(%rsp), %r9
485 movq 16(%rsp), %r10
486 movq 24(%rsp), %r11
487 movq 32(%rsp), %r12
488 movq 40(%rsp), %r13
489 movq 48(%rsp), %r14
490 movq 56(%rsp), %r15
491
492 call __rsaz_512_reduce
493
494 addq 64(%rsp), %r8
495 adcq 72(%rsp), %r9
496 adcq 80(%rsp), %r10
497 adcq 88(%rsp), %r11
498 adcq 96(%rsp), %r12
499 adcq 104(%rsp), %r13
500 adcq 112(%rsp), %r14
501 adcq 120(%rsp), %r15
502 sbbq %rcx, %rcx
503
504 call __rsaz_512_subtract
505
506 movq %r8, %rdx
507 movq %r9, %rax
508 movl 128+8(%rsp), $times
509 movq $out, $inp
510
511 decl $times
512 jnz .Loop_sqr
513 ___
514 if ($addx) {
515 $code.=<<___;
516 jmp .Lsqr_tail
517
518 .align 32
519 .Loop_sqrx:
520 movl $times,128+8(%rsp)
521 movq $out, %xmm0 # off-load
522 movq %rbp, %xmm1 # off-load
523 #first iteration
524 mulx %rax, %r8, %r9
525
526 mulx 16($inp), %rcx, %r10
527 xor %rbp, %rbp # cf=0, of=0
528
529 mulx 24($inp), %rax, %r11
530 adcx %rcx, %r9
531
532 mulx 32($inp), %rcx, %r12
533 adcx %rax, %r10
534
535 mulx 40($inp), %rax, %r13
536 adcx %rcx, %r11
537
538 .byte 0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($inp), %rcx, %r14
539 adcx %rax, %r12
540 adcx %rcx, %r13
541
542 .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r15
543 adcx %rax, %r14
544 adcx %rbp, %r15 # %rbp is 0
545
546 mov %r9, %rcx
547 shld \$1, %r8, %r9
548 shl \$1, %r8
549
550 xor %ebp, %ebp
551 mulx %rdx, %rax, %rdx
552 adcx %rdx, %r8
553 mov 8($inp), %rdx
554 adcx %rbp, %r9
555
556 mov %rax, (%rsp)
557 mov %r8, 8(%rsp)
558
559 #second iteration
560 mulx 16($inp), %rax, %rbx
561 adox %rax, %r10
562 adcx %rbx, %r11
563
564 .byte 0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r8
565 adox $out, %r11
566 adcx %r8, %r12
567
568 mulx 32($inp), %rax, %rbx
569 adox %rax, %r12
570 adcx %rbx, %r13
571
572 mulx 40($inp), $out, %r8
573 adox $out, %r13
574 adcx %r8, %r14
575
576 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
577 adox %rax, %r14
578 adcx %rbx, %r15
579
580 .byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r8
581 adox $out, %r15
582 adcx %rbp, %r8
583 adox %rbp, %r8
584
585 mov %r11, %rbx
586 shld \$1, %r10, %r11
587 shld \$1, %rcx, %r10
588
589 xor %ebp,%ebp
590 mulx %rdx, %rax, %rcx
591 mov 16($inp), %rdx
592 adcx %rax, %r9
593 adcx %rcx, %r10
594 adcx %rbp, %r11
595
596 mov %r9, 16(%rsp)
597 .byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 # mov %r10, 24(%rsp)
598
599 #third iteration
600 .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r9
601 adox $out, %r12
602 adcx %r9, %r13
603
604 mulx 32($inp), %rax, %rcx
605 adox %rax, %r13
606 adcx %rcx, %r14
607
608 mulx 40($inp), $out, %r9
609 adox $out, %r14
610 adcx %r9, %r15
611
612 .byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rcx
613 adox %rax, %r15
614 adcx %rcx, %r8
615
616 .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r9
617 adox $out, %r8
618 adcx %rbp, %r9
619 adox %rbp, %r9
620
621 mov %r13, %rcx
622 shld \$1, %r12, %r13
623 shld \$1, %rbx, %r12
624
625 xor %ebp, %ebp
626 mulx %rdx, %rax, %rdx
627 adcx %rax, %r11
628 adcx %rdx, %r12
629 mov 24($inp), %rdx
630 adcx %rbp, %r13
631
632 mov %r11, 32(%rsp)
633 .byte 0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00 # mov %r12, 40(%rsp)
634
635 #fourth iteration
636 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00 # mulx 32($inp), %rax, %rbx
637 adox %rax, %r14
638 adcx %rbx, %r15
639
640 mulx 40($inp), $out, %r10
641 adox $out, %r15
642 adcx %r10, %r8
643
644 mulx 48($inp), %rax, %rbx
645 adox %rax, %r8
646 adcx %rbx, %r9
647
648 mulx 56($inp), $out, %r10
649 adox $out, %r9
650 adcx %rbp, %r10
651 adox %rbp, %r10
652
653 .byte 0x66
654 mov %r15, %rbx
655 shld \$1, %r14, %r15
656 shld \$1, %rcx, %r14
657
658 xor %ebp, %ebp
659 mulx %rdx, %rax, %rdx
660 adcx %rax, %r13
661 adcx %rdx, %r14
662 mov 32($inp), %rdx
663 adcx %rbp, %r15
664
665 mov %r13, 48(%rsp)
666 mov %r14, 56(%rsp)
667
668 #fifth iteration
669 .byte 0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00 # mulx 40($inp), $out, %r11
670 adox $out, %r8
671 adcx %r11, %r9
672
673 mulx 48($inp), %rax, %rcx
674 adox %rax, %r9
675 adcx %rcx, %r10
676
677 mulx 56($inp), $out, %r11
678 adox $out, %r10
679 adcx %rbp, %r11
680 adox %rbp, %r11
681
682 mov %r9, %rcx
683 shld \$1, %r8, %r9
684 shld \$1, %rbx, %r8
685
686 xor %ebp, %ebp
687 mulx %rdx, %rax, %rdx
688 adcx %rax, %r15
689 adcx %rdx, %r8
690 mov 40($inp), %rdx
691 adcx %rbp, %r9
692
693 mov %r15, 64(%rsp)
694 mov %r8, 72(%rsp)
695
696 #sixth iteration
697 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
698 adox %rax, %r10
699 adcx %rbx, %r11
700
701 .byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r12
702 adox $out, %r11
703 adcx %rbp, %r12
704 adox %rbp, %r12
705
706 mov %r11, %rbx
707 shld \$1, %r10, %r11
708 shld \$1, %rcx, %r10
709
710 xor %ebp, %ebp
711 mulx %rdx, %rax, %rdx
712 adcx %rax, %r9
713 adcx %rdx, %r10
714 mov 48($inp), %rdx
715 adcx %rbp, %r11
716
717 mov %r9, 80(%rsp)
718 mov %r10, 88(%rsp)
719
720 #seventh iteration
721 .byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r13
722 adox %rax, %r12
723 adox %rbp, %r13
724
725 xor %r14, %r14
726 shld \$1, %r13, %r14
727 shld \$1, %r12, %r13
728 shld \$1, %rbx, %r12
729
730 xor %ebp, %ebp
731 mulx %rdx, %rax, %rdx
732 adcx %rax, %r11
733 adcx %rdx, %r12
734 mov 56($inp), %rdx
735 adcx %rbp, %r13
736
737 .byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 # mov %r11, 96(%rsp)
738 .byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 # mov %r12, 104(%rsp)
739
740 #eighth iteration
741 mulx %rdx, %rax, %rdx
742 adox %rax, %r13
743 adox %rbp, %rdx
744
745 .byte 0x66
746 add %rdx, %r14
747
748 movq %r13, 112(%rsp)
749 movq %r14, 120(%rsp)
750 movq %xmm0, $out
751 movq %xmm1, %rbp
752
753 movq 128(%rsp), %rdx # pull $n0
754 movq (%rsp), %r8
755 movq 8(%rsp), %r9
756 movq 16(%rsp), %r10
757 movq 24(%rsp), %r11
758 movq 32(%rsp), %r12
759 movq 40(%rsp), %r13
760 movq 48(%rsp), %r14
761 movq 56(%rsp), %r15
762
763 call __rsaz_512_reducex
764
765 addq 64(%rsp), %r8
766 adcq 72(%rsp), %r9
767 adcq 80(%rsp), %r10
768 adcq 88(%rsp), %r11
769 adcq 96(%rsp), %r12
770 adcq 104(%rsp), %r13
771 adcq 112(%rsp), %r14
772 adcq 120(%rsp), %r15
773 sbbq %rcx, %rcx
774
775 call __rsaz_512_subtract
776
777 movq %r8, %rdx
778 movq %r9, %rax
779 movl 128+8(%rsp), $times
780 movq $out, $inp
781
782 decl $times
783 jnz .Loop_sqrx
784
785 .Lsqr_tail:
786 ___
787 }
788 $code.=<<___;
789
790 leaq 128+24+48(%rsp), %rax
791 movq -48(%rax), %r15
792 movq -40(%rax), %r14
793 movq -32(%rax), %r13
794 movq -24(%rax), %r12
795 movq -16(%rax), %rbp
796 movq -8(%rax), %rbx
797 leaq (%rax), %rsp
798 .Lsqr_epilogue:
799 ret
800 .size rsaz_512_sqr,.-rsaz_512_sqr
801 ___
802 }
803 {
804 my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
805 $code.=<<___;
806 .globl rsaz_512_mul
807 .type rsaz_512_mul,\@function,5
808 .align 32
809 rsaz_512_mul:
810 push %rbx
811 push %rbp
812 push %r12
813 push %r13
814 push %r14
815 push %r15
816
817 subq \$128+24, %rsp
818 .Lmul_body:
819 movq $out, %xmm0 # off-load arguments
820 movq $mod, %xmm1
821 movq $n0, 128(%rsp)
822 ___
823 $code.=<<___ if ($addx);
824 movl \$0x80100,%r11d
825 andl OPENSSL_ia32cap_P+8(%rip),%r11d
826 cmpl \$0x80100,%r11d # check for MULX and ADO/CX
827 je .Lmulx
828 ___
829 $code.=<<___;
830 movq ($bp), %rbx # pass b[0]
831 movq $bp, %rbp # pass argument
832 call __rsaz_512_mul
833
834 movq %xmm0, $out
835 movq %xmm1, %rbp
836
837 movq (%rsp), %r8
838 movq 8(%rsp), %r9
839 movq 16(%rsp), %r10
840 movq 24(%rsp), %r11
841 movq 32(%rsp), %r12
842 movq 40(%rsp), %r13
843 movq 48(%rsp), %r14
844 movq 56(%rsp), %r15
845
846 call __rsaz_512_reduce
847 ___
848 $code.=<<___ if ($addx);
849 jmp .Lmul_tail
850
851 .align 32
852 .Lmulx:
853 movq $bp, %rbp # pass argument
854 movq ($bp), %rdx # pass b[0]
855 call __rsaz_512_mulx
856
857 movq %xmm0, $out
858 movq %xmm1, %rbp
859
860 movq 128(%rsp), %rdx # pull $n0
861 movq (%rsp), %r8
862 movq 8(%rsp), %r9
863 movq 16(%rsp), %r10
864 movq 24(%rsp), %r11
865 movq 32(%rsp), %r12
866 movq 40(%rsp), %r13
867 movq 48(%rsp), %r14
868 movq 56(%rsp), %r15
869
870 call __rsaz_512_reducex
871 .Lmul_tail:
872 ___
873 $code.=<<___;
874 addq 64(%rsp), %r8
875 adcq 72(%rsp), %r9
876 adcq 80(%rsp), %r10
877 adcq 88(%rsp), %r11
878 adcq 96(%rsp), %r12
879 adcq 104(%rsp), %r13
880 adcq 112(%rsp), %r14
881 adcq 120(%rsp), %r15
882 sbbq %rcx, %rcx
883
884 call __rsaz_512_subtract
885
886 leaq 128+24+48(%rsp), %rax
887 movq -48(%rax), %r15
888 movq -40(%rax), %r14
889 movq -32(%rax), %r13
890 movq -24(%rax), %r12
891 movq -16(%rax), %rbp
892 movq -8(%rax), %rbx
893 leaq (%rax), %rsp
894 .Lmul_epilogue:
895 ret
896 .size rsaz_512_mul,.-rsaz_512_mul
897 ___
898 }
899 {
900 my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
901 $code.=<<___;
902 .globl rsaz_512_mul_gather4
903 .type rsaz_512_mul_gather4,\@function,6
904 .align 32
905 rsaz_512_mul_gather4:
906 push %rbx
907 push %rbp
908 push %r12
909 push %r13
910 push %r14
911 push %r15
912
913 subq \$128+24, %rsp
914 .Lmul_gather4_body:
915 ___
916 $code.=<<___ if ($addx);
917 movl \$0x80100,%r11d
918 andl OPENSSL_ia32cap_P+8(%rip),%r11d
919 cmpl \$0x80100,%r11d # check for MULX and ADO/CX
920 je .Lmulx_gather
921 ___
922 $code.=<<___;
923 movl 64($bp,$pwr,4), %eax
924 movq $out, %xmm0 # off-load arguments
925 movl ($bp,$pwr,4), %ebx
926 movq $mod, %xmm1
927 movq $n0, 128(%rsp)
928
929 shlq \$32, %rax
930 or %rax, %rbx
931 movq ($ap), %rax
932 movq 8($ap), %rcx
933 leaq 128($bp,$pwr,4), %rbp
934 mulq %rbx # 0 iteration
935 movq %rax, (%rsp)
936 movq %rcx, %rax
937 movq %rdx, %r8
938
939 mulq %rbx
940 movd (%rbp), %xmm4
941 addq %rax, %r8
942 movq 16($ap), %rax
943 movq %rdx, %r9
944 adcq \$0, %r9
945
946 mulq %rbx
947 movd 64(%rbp), %xmm5
948 addq %rax, %r9
949 movq 24($ap), %rax
950 movq %rdx, %r10
951 adcq \$0, %r10
952
953 mulq %rbx
954 pslldq \$4, %xmm5
955 addq %rax, %r10
956 movq 32($ap), %rax
957 movq %rdx, %r11
958 adcq \$0, %r11
959
960 mulq %rbx
961 por %xmm5, %xmm4
962 addq %rax, %r11
963 movq 40($ap), %rax
964 movq %rdx, %r12
965 adcq \$0, %r12
966
967 mulq %rbx
968 addq %rax, %r12
969 movq 48($ap), %rax
970 movq %rdx, %r13
971 adcq \$0, %r13
972
973 mulq %rbx
974 leaq 128(%rbp), %rbp
975 addq %rax, %r13
976 movq 56($ap), %rax
977 movq %rdx, %r14
978 adcq \$0, %r14
979
980 mulq %rbx
981 movq %xmm4, %rbx
982 addq %rax, %r14
983 movq ($ap), %rax
984 movq %rdx, %r15
985 adcq \$0, %r15
986
987 leaq 8(%rsp), %rdi
988 movl \$7, %ecx
989 jmp .Loop_mul_gather
990
991 .align 32
992 .Loop_mul_gather:
993 mulq %rbx
994 addq %rax, %r8
995 movq 8($ap), %rax
996 movq %r8, (%rdi)
997 movq %rdx, %r8
998 adcq \$0, %r8
999
1000 mulq %rbx
1001 movd (%rbp), %xmm4
1002 addq %rax, %r9
1003 movq 16($ap), %rax
1004 adcq \$0, %rdx
1005 addq %r9, %r8
1006 movq %rdx, %r9
1007 adcq \$0, %r9
1008
1009 mulq %rbx
1010 movd 64(%rbp), %xmm5
1011 addq %rax, %r10
1012 movq 24($ap), %rax
1013 adcq \$0, %rdx
1014 addq %r10, %r9
1015 movq %rdx, %r10
1016 adcq \$0, %r10
1017
1018 mulq %rbx
1019 pslldq \$4, %xmm5
1020 addq %rax, %r11
1021 movq 32($ap), %rax
1022 adcq \$0, %rdx
1023 addq %r11, %r10
1024 movq %rdx, %r11
1025 adcq \$0, %r11
1026
1027 mulq %rbx
1028 por %xmm5, %xmm4
1029 addq %rax, %r12
1030 movq 40($ap), %rax
1031 adcq \$0, %rdx
1032 addq %r12, %r11
1033 movq %rdx, %r12
1034 adcq \$0, %r12
1035
1036 mulq %rbx
1037 addq %rax, %r13
1038 movq 48($ap), %rax
1039 adcq \$0, %rdx
1040 addq %r13, %r12
1041 movq %rdx, %r13
1042 adcq \$0, %r13
1043
1044 mulq %rbx
1045 addq %rax, %r14
1046 movq 56($ap), %rax
1047 adcq \$0, %rdx
1048 addq %r14, %r13
1049 movq %rdx, %r14
1050 adcq \$0, %r14
1051
1052 mulq %rbx
1053 movq %xmm4, %rbx
1054 addq %rax, %r15
1055 movq ($ap), %rax
1056 adcq \$0, %rdx
1057 addq %r15, %r14
1058 movq %rdx, %r15
1059 adcq \$0, %r15
1060
1061 leaq 128(%rbp), %rbp
1062 leaq 8(%rdi), %rdi
1063
1064 decl %ecx
1065 jnz .Loop_mul_gather
1066
1067 movq %r8, (%rdi)
1068 movq %r9, 8(%rdi)
1069 movq %r10, 16(%rdi)
1070 movq %r11, 24(%rdi)
1071 movq %r12, 32(%rdi)
1072 movq %r13, 40(%rdi)
1073 movq %r14, 48(%rdi)
1074 movq %r15, 56(%rdi)
1075
1076 movq %xmm0, $out
1077 movq %xmm1, %rbp
1078
1079 movq (%rsp), %r8
1080 movq 8(%rsp), %r9
1081 movq 16(%rsp), %r10
1082 movq 24(%rsp), %r11
1083 movq 32(%rsp), %r12
1084 movq 40(%rsp), %r13
1085 movq 48(%rsp), %r14
1086 movq 56(%rsp), %r15
1087
1088 call __rsaz_512_reduce
1089 ___
1090 $code.=<<___ if ($addx);
1091 jmp .Lmul_gather_tail
1092
1093 .align 32
1094 .Lmulx_gather:
1095 mov 64($bp,$pwr,4), %eax
1096 movq $out, %xmm0 # off-load arguments
1097 lea 128($bp,$pwr,4), %rbp
1098 mov ($bp,$pwr,4), %edx
1099 movq $mod, %xmm1
1100 mov $n0, 128(%rsp)
1101
1102 shl \$32, %rax
1103 or %rax, %rdx
1104 mulx ($ap), %rbx, %r8 # 0 iteration
1105 mov %rbx, (%rsp)
1106 xor %edi, %edi # cf=0, of=0
1107
1108 mulx 8($ap), %rax, %r9
1109 movd (%rbp), %xmm4
1110
1111 mulx 16($ap), %rbx, %r10
1112 movd 64(%rbp), %xmm5
1113 adcx %rax, %r8
1114
1115 mulx 24($ap), %rax, %r11
1116 pslldq \$4, %xmm5
1117 adcx %rbx, %r9
1118
1119 mulx 32($ap), %rbx, %r12
1120 por %xmm5, %xmm4
1121 adcx %rax, %r10
1122
1123 mulx 40($ap), %rax, %r13
1124 adcx %rbx, %r11
1125
1126 mulx 48($ap), %rbx, %r14
1127 lea 128(%rbp), %rbp
1128 adcx %rax, %r12
1129
1130 mulx 56($ap), %rax, %r15
1131 movq %xmm4, %rdx
1132 adcx %rbx, %r13
1133 adcx %rax, %r14
1134 mov %r8, %rbx
1135 adcx %rdi, %r15 # %rdi is 0
1136
1137 mov \$-7, %rcx
1138 jmp .Loop_mulx_gather
1139
1140 .align 32
1141 .Loop_mulx_gather:
1142 mulx ($ap), %rax, %r8
1143 adcx %rax, %rbx
1144 adox %r9, %r8
1145
1146 mulx 8($ap), %rax, %r9
1147 .byte 0x66,0x0f,0x6e,0xa5,0x00,0x00,0x00,0x00 # movd (%rbp), %xmm4
1148 adcx %rax, %r8
1149 adox %r10, %r9
1150
1151 mulx 16($ap), %rax, %r10
1152 movd 64(%rbp), %xmm5
1153 lea 128(%rbp), %rbp
1154 adcx %rax, %r9
1155 adox %r11, %r10
1156
1157 .byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 # mulx 24($ap), %rax, %r11
1158 pslldq \$4, %xmm5
1159 por %xmm5, %xmm4
1160 adcx %rax, %r10
1161 adox %r12, %r11
1162
1163 mulx 32($ap), %rax, %r12
1164 adcx %rax, %r11
1165 adox %r13, %r12
1166
1167 mulx 40($ap), %rax, %r13
1168 adcx %rax, %r12
1169 adox %r14, %r13
1170
1171 .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
1172 adcx %rax, %r13
1173 adox %r15, %r14
1174
1175 mulx 56($ap), %rax, %r15
1176 movq %xmm4, %rdx
1177 mov %rbx, 64(%rsp,%rcx,8)
1178 adcx %rax, %r14
1179 adox %rdi, %r15
1180 mov %r8, %rbx
1181 adcx %rdi, %r15 # cf=0
1182
1183 inc %rcx # of=0
1184 jnz .Loop_mulx_gather
1185
1186 mov %r8, 64(%rsp)
1187 mov %r9, 64+8(%rsp)
1188 mov %r10, 64+16(%rsp)
1189 mov %r11, 64+24(%rsp)
1190 mov %r12, 64+32(%rsp)
1191 mov %r13, 64+40(%rsp)
1192 mov %r14, 64+48(%rsp)
1193 mov %r15, 64+56(%rsp)
1194
1195 movq %xmm0, $out
1196 movq %xmm1, %rbp
1197
1198 mov 128(%rsp), %rdx # pull $n0
1199 mov (%rsp), %r8
1200 mov 8(%rsp), %r9
1201 mov 16(%rsp), %r10
1202 mov 24(%rsp), %r11
1203 mov 32(%rsp), %r12
1204 mov 40(%rsp), %r13
1205 mov 48(%rsp), %r14
1206 mov 56(%rsp), %r15
1207
1208 call __rsaz_512_reducex
1209
1210 .Lmul_gather_tail:
1211 ___
1212 $code.=<<___;
1213 addq 64(%rsp), %r8
1214 adcq 72(%rsp), %r9
1215 adcq 80(%rsp), %r10
1216 adcq 88(%rsp), %r11
1217 adcq 96(%rsp), %r12
1218 adcq 104(%rsp), %r13
1219 adcq 112(%rsp), %r14
1220 adcq 120(%rsp), %r15
1221 sbbq %rcx, %rcx
1222
1223 call __rsaz_512_subtract
1224
1225 leaq 128+24+48(%rsp), %rax
1226 movq -48(%rax), %r15
1227 movq -40(%rax), %r14
1228 movq -32(%rax), %r13
1229 movq -24(%rax), %r12
1230 movq -16(%rax), %rbp
1231 movq -8(%rax), %rbx
1232 leaq (%rax), %rsp
1233 .Lmul_gather4_epilogue:
1234 ret
1235 .size rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
1236 ___
1237 }
1238 {
1239 my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1240 $code.=<<___;
1241 .globl rsaz_512_mul_scatter4
1242 .type rsaz_512_mul_scatter4,\@function,6
1243 .align 32
1244 rsaz_512_mul_scatter4:
1245 push %rbx
1246 push %rbp
1247 push %r12
1248 push %r13
1249 push %r14
1250 push %r15
1251
1252 subq \$128+24, %rsp
1253 .Lmul_scatter4_body:
1254 leaq ($tbl,$pwr,4), $tbl
1255 movq $out, %xmm0 # off-load arguments
1256 movq $mod, %xmm1
1257 movq $tbl, %xmm2
1258 movq $n0, 128(%rsp)
1259
1260 movq $out, %rbp
1261 ___
1262 $code.=<<___ if ($addx);
1263 movl \$0x80100,%r11d
1264 andl OPENSSL_ia32cap_P+8(%rip),%r11d
1265 cmpl \$0x80100,%r11d # check for MULX and ADO/CX
1266 je .Lmulx_scatter
1267 ___
1268 $code.=<<___;
1269 movq ($out),%rbx # pass b[0]
1270 call __rsaz_512_mul
1271
1272 movq %xmm0, $out
1273 movq %xmm1, %rbp
1274
1275 movq (%rsp), %r8
1276 movq 8(%rsp), %r9
1277 movq 16(%rsp), %r10
1278 movq 24(%rsp), %r11
1279 movq 32(%rsp), %r12
1280 movq 40(%rsp), %r13
1281 movq 48(%rsp), %r14
1282 movq 56(%rsp), %r15
1283
1284 call __rsaz_512_reduce
1285 ___
1286 $code.=<<___ if ($addx);
1287 jmp .Lmul_scatter_tail
1288
1289 .align 32
1290 .Lmulx_scatter:
1291 movq ($out), %rdx # pass b[0]
1292 call __rsaz_512_mulx
1293
1294 movq %xmm0, $out
1295 movq %xmm1, %rbp
1296
1297 movq 128(%rsp), %rdx # pull $n0
1298 movq (%rsp), %r8
1299 movq 8(%rsp), %r9
1300 movq 16(%rsp), %r10
1301 movq 24(%rsp), %r11
1302 movq 32(%rsp), %r12
1303 movq 40(%rsp), %r13
1304 movq 48(%rsp), %r14
1305 movq 56(%rsp), %r15
1306
1307 call __rsaz_512_reducex
1308
1309 .Lmul_scatter_tail:
1310 ___
1311 $code.=<<___;
1312 addq 64(%rsp), %r8
1313 adcq 72(%rsp), %r9
1314 adcq 80(%rsp), %r10
1315 adcq 88(%rsp), %r11
1316 adcq 96(%rsp), %r12
1317 adcq 104(%rsp), %r13
1318 adcq 112(%rsp), %r14
1319 adcq 120(%rsp), %r15
1320 movq %xmm2, $inp
1321 sbbq %rcx, %rcx
1322
1323 call __rsaz_512_subtract
1324
1325 movl %r8d, 64*0($inp) # scatter
1326 shrq \$32, %r8
1327 movl %r9d, 64*2($inp)
1328 shrq \$32, %r9
1329 movl %r10d, 64*4($inp)
1330 shrq \$32, %r10
1331 movl %r11d, 64*6($inp)
1332 shrq \$32, %r11
1333 movl %r12d, 64*8($inp)
1334 shrq \$32, %r12
1335 movl %r13d, 64*10($inp)
1336 shrq \$32, %r13
1337 movl %r14d, 64*12($inp)
1338 shrq \$32, %r14
1339 movl %r15d, 64*14($inp)
1340 shrq \$32, %r15
1341 movl %r8d, 64*1($inp)
1342 movl %r9d, 64*3($inp)
1343 movl %r10d, 64*5($inp)
1344 movl %r11d, 64*7($inp)
1345 movl %r12d, 64*9($inp)
1346 movl %r13d, 64*11($inp)
1347 movl %r14d, 64*13($inp)
1348 movl %r15d, 64*15($inp)
1349
1350 leaq 128+24+48(%rsp), %rax
1351 movq -48(%rax), %r15
1352 movq -40(%rax), %r14
1353 movq -32(%rax), %r13
1354 movq -24(%rax), %r12
1355 movq -16(%rax), %rbp
1356 movq -8(%rax), %rbx
1357 leaq (%rax), %rsp
1358 .Lmul_scatter4_epilogue:
1359 ret
1360 .size rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
1361 ___
1362 }
1363 {
1364 my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx");
1365 $code.=<<___;
1366 .globl rsaz_512_mul_by_one
1367 .type rsaz_512_mul_by_one,\@function,4
1368 .align 32
1369 rsaz_512_mul_by_one:
1370 push %rbx
1371 push %rbp
1372 push %r12
1373 push %r13
1374 push %r14
1375 push %r15
1376
1377 subq \$128+24, %rsp
1378 .Lmul_by_one_body:
1379 ___
1380 $code.=<<___ if ($addx);
1381 movl OPENSSL_ia32cap_P+8(%rip),%eax
1382 ___
1383 $code.=<<___;
1384 movq $mod, %rbp # reassign argument
1385 movq $n0, 128(%rsp)
1386
1387 movq ($inp), %r8
1388 pxor %xmm0, %xmm0
1389 movq 8($inp), %r9
1390 movq 16($inp), %r10
1391 movq 24($inp), %r11
1392 movq 32($inp), %r12
1393 movq 40($inp), %r13
1394 movq 48($inp), %r14
1395 movq 56($inp), %r15
1396
1397 movdqa %xmm0, (%rsp)
1398 movdqa %xmm0, 16(%rsp)
1399 movdqa %xmm0, 32(%rsp)
1400 movdqa %xmm0, 48(%rsp)
1401 movdqa %xmm0, 64(%rsp)
1402 movdqa %xmm0, 80(%rsp)
1403 movdqa %xmm0, 96(%rsp)
1404 ___
1405 $code.=<<___ if ($addx);
1406 andl \$0x80100,%eax
1407 cmpl \$0x80100,%eax # check for MULX and ADO/CX
1408 je .Lby_one_callx
1409 ___
1410 $code.=<<___;
1411 call __rsaz_512_reduce
1412 ___
1413 $code.=<<___ if ($addx);
1414 jmp .Lby_one_tail
1415 .align 32
1416 .Lby_one_callx:
1417 movq 128(%rsp), %rdx # pull $n0
1418 call __rsaz_512_reducex
1419 .Lby_one_tail:
1420 ___
1421 $code.=<<___;
1422 movq %r8, ($out)
1423 movq %r9, 8($out)
1424 movq %r10, 16($out)
1425 movq %r11, 24($out)
1426 movq %r12, 32($out)
1427 movq %r13, 40($out)
1428 movq %r14, 48($out)
1429 movq %r15, 56($out)
1430
1431 leaq 128+24+48(%rsp), %rax
1432 movq -48(%rax), %r15
1433 movq -40(%rax), %r14
1434 movq -32(%rax), %r13
1435 movq -24(%rax), %r12
1436 movq -16(%rax), %rbp
1437 movq -8(%rax), %rbx
1438 leaq (%rax), %rsp
1439 .Lmul_by_one_epilogue:
1440 ret
1441 .size rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
1442 ___
1443 }
1444 { # __rsaz_512_reduce
1445 #
1446 # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
1447 # output: %r8-%r15
1448 # clobbers: everything except %rbp and %rdi
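# The register shuffling below implements word-wise Montgomery reduction
# (REDC) with n0 = -mod^-1 mod 2^64.  Stripped of the scheduling details, the
# arithmetic is equivalent to this illustrative Perl model (not part of the
# build, the helper name is ad hoc):
#
#	use Math::BigInt;
#	sub redc512_ref {
#	    my ($t, $m, $n0) = @_;		# t < 2^512, m odd, n0 = -1/m mod 2^64
#	    my $w = Math::BigInt->new(1)->blsft(64);
#	    for (1..8) {
#		my $mu = (($t % $w) * $n0) % $w;  # chosen so the low limb cancels
#		$t = ($t + $mu * $m)->brsft(64);  # t += mu*m, then drop the zero limb
#	    }
#	    return $t;	# the caller adds the upper half of the product
#	}		# and conditionally subtracts m afterwards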
1449 $code.=<<___;
1450 .type __rsaz_512_reduce,\@abi-omnipotent
1451 .align 32
1452 __rsaz_512_reduce:
1453 movq %r8, %rbx
1454 imulq 128+8(%rsp), %rbx
1455 movq 0(%rbp), %rax
1456 movl \$8, %ecx
1457 jmp .Lreduction_loop
1458
1459 .align 32
1460 .Lreduction_loop:
1461 mulq %rbx
1462 movq 8(%rbp), %rax
1463 negq %r8
1464 movq %rdx, %r8
1465 adcq \$0, %r8
1466
1467 mulq %rbx
1468 addq %rax, %r9
1469 movq 16(%rbp), %rax
1470 adcq \$0, %rdx
1471 addq %r9, %r8
1472 movq %rdx, %r9
1473 adcq \$0, %r9
1474
1475 mulq %rbx
1476 addq %rax, %r10
1477 movq 24(%rbp), %rax
1478 adcq \$0, %rdx
1479 addq %r10, %r9
1480 movq %rdx, %r10
1481 adcq \$0, %r10
1482
1483 mulq %rbx
1484 addq %rax, %r11
1485 movq 32(%rbp), %rax
1486 adcq \$0, %rdx
1487 addq %r11, %r10
1488 movq 128+8(%rsp), %rsi
1489 #movq %rdx, %r11
1490 #adcq \$0, %r11
1491 adcq \$0, %rdx
1492 movq %rdx, %r11
1493
1494 mulq %rbx
1495 addq %rax, %r12
1496 movq 40(%rbp), %rax
1497 adcq \$0, %rdx
1498 imulq %r8, %rsi
1499 addq %r12, %r11
1500 movq %rdx, %r12
1501 adcq \$0, %r12
1502
1503 mulq %rbx
1504 addq %rax, %r13
1505 movq 48(%rbp), %rax
1506 adcq \$0, %rdx
1507 addq %r13, %r12
1508 movq %rdx, %r13
1509 adcq \$0, %r13
1510
1511 mulq %rbx
1512 addq %rax, %r14
1513 movq 56(%rbp), %rax
1514 adcq \$0, %rdx
1515 addq %r14, %r13
1516 movq %rdx, %r14
1517 adcq \$0, %r14
1518
1519 mulq %rbx
1520 movq %rsi, %rbx
1521 addq %rax, %r15
1522 movq 0(%rbp), %rax
1523 adcq \$0, %rdx
1524 addq %r15, %r14
1525 movq %rdx, %r15
1526 adcq \$0, %r15
1527
1528 decl %ecx
1529 jne .Lreduction_loop
1530
1531 ret
1532 .size __rsaz_512_reduce,.-__rsaz_512_reduce
1533 ___
1534 }
1535 if ($addx) {
1536 # __rsaz_512_reducex
1537 #
1538 # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
1539 # output: %r8-%r15
1540 # clobbers: everything except %rbp and %rdi
1541 $code.=<<___;
1542 .type __rsaz_512_reducex,\@abi-omnipotent
1543 .align 32
1544 __rsaz_512_reducex:
1545 #movq 128+8(%rsp), %rdx # pull $n0
1546 imulq %r8, %rdx
1547 xorq %rsi, %rsi # cf=0,of=0
1548 movl \$8, %ecx
1549 jmp .Lreduction_loopx
1550
1551 .align 32
1552 .Lreduction_loopx:
1553 mov %r8, %rbx
1554 mulx 0(%rbp), %rax, %r8
1555 adcx %rbx, %rax
1556 adox %r9, %r8
1557
1558 mulx 8(%rbp), %rax, %r9
1559 adcx %rax, %r8
1560 adox %r10, %r9
1561
1562 mulx 16(%rbp), %rbx, %r10
1563 adcx %rbx, %r9
1564 adox %r11, %r10
1565
1566 mulx 24(%rbp), %rbx, %r11
1567 adcx %rbx, %r10
1568 adox %r12, %r11
1569
1570 .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 32(%rbp), %rbx, %r12
1571 mov %rdx, %rax
1572 mov %r8, %rdx
1573 adcx %rbx, %r11
1574 adox %r13, %r12
1575
1576 mulx 128+8(%rsp), %rbx, %rdx
1577 mov %rax, %rdx
1578
1579 mulx 40(%rbp), %rax, %r13
1580 adcx %rax, %r12
1581 adox %r14, %r13
1582
1583 .byte 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00 # mulx 48(%rbp), %rax, %r14
1584 adcx %rax, %r13
1585 adox %r15, %r14
1586
1587 mulx 56(%rbp), %rax, %r15
1588 mov %rbx, %rdx
1589 adcx %rax, %r14
1590 adox %rsi, %r15 # %rsi is 0
1591 adcx %rsi, %r15 # cf=0
1592
1593 decl %ecx # of=0
1594 jne .Lreduction_loopx
1595
1596 ret
1597 .size __rsaz_512_reducex,.-__rsaz_512_reducex
1598 ___
1599 }
1600 { # __rsaz_512_subtract
1601 # input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask
1602 # output:
1603 # clobbers: everything but %rdi, %rsi and %rbp
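# The mask in %rcx is either 0 or all-ones (the caller derives it from the
# carry of the preceding addition with sbb), and the neg/not sequence below
# builds 2^512 - mod limb by limb; this shortcut is valid because the modulus
# is odd, so its least significant limb is non-zero and the "+1" of the two's
# complement never carries past it.  Adding the masked result back in is thus
# a branch-free "subtract mod if the addition carried out of 512 bits".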
1604 $code.=<<___;
1605 .type __rsaz_512_subtract,\@abi-omnipotent
1606 .align 32
1607 __rsaz_512_subtract:
1608 movq %r8, ($out)
1609 movq %r9, 8($out)
1610 movq %r10, 16($out)
1611 movq %r11, 24($out)
1612 movq %r12, 32($out)
1613 movq %r13, 40($out)
1614 movq %r14, 48($out)
1615 movq %r15, 56($out)
1616
1617 movq 0($mod), %r8
1618 movq 8($mod), %r9
1619 negq %r8
1620 notq %r9
1621 andq %rcx, %r8
1622 movq 16($mod), %r10
1623 andq %rcx, %r9
1624 notq %r10
1625 movq 24($mod), %r11
1626 andq %rcx, %r10
1627 notq %r11
1628 movq 32($mod), %r12
1629 andq %rcx, %r11
1630 notq %r12
1631 movq 40($mod), %r13
1632 andq %rcx, %r12
1633 notq %r13
1634 movq 48($mod), %r14
1635 andq %rcx, %r13
1636 notq %r14
1637 movq 56($mod), %r15
1638 andq %rcx, %r14
1639 notq %r15
1640 andq %rcx, %r15
1641
1642 addq ($out), %r8
1643 adcq 8($out), %r9
1644 adcq 16($out), %r10
1645 adcq 24($out), %r11
1646 adcq 32($out), %r12
1647 adcq 40($out), %r13
1648 adcq 48($out), %r14
1649 adcq 56($out), %r15
1650
1651 movq %r8, ($out)
1652 movq %r9, 8($out)
1653 movq %r10, 16($out)
1654 movq %r11, 24($out)
1655 movq %r12, 32($out)
1656 movq %r13, 40($out)
1657 movq %r14, 48($out)
1658 movq %r15, 56($out)
1659
1660 ret
1661 .size __rsaz_512_subtract,.-__rsaz_512_subtract
1662 ___
1663 }
1664 { # __rsaz_512_mul
1665 #
1666 # input: %rsi - ap, %rbp - bp
1667 # output:
1668 # clobbers: everything
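# Plain schoolbook multiplication: b is consumed one 64-bit limb per outer
# pass, each pass adds a*b[j] into a sliding 8-limb accumulator and retires
# the lowest limb to the stack, so the full 16-limb (1024-bit) product ends
# up on the caller's frame (this routine's 8(%rsp)), ready for the caller's
# Montgomery reduction.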
1669 my ($ap,$bp) = ("%rsi","%rbp");
1670 $code.=<<___;
1671 .type __rsaz_512_mul,\@abi-omnipotent
1672 .align 32
1673 __rsaz_512_mul:
1674 leaq 8(%rsp), %rdi
1675
1676 movq ($ap), %rax
1677 mulq %rbx
1678 movq %rax, (%rdi)
1679 movq 8($ap), %rax
1680 movq %rdx, %r8
1681
1682 mulq %rbx
1683 addq %rax, %r8
1684 movq 16($ap), %rax
1685 movq %rdx, %r9
1686 adcq \$0, %r9
1687
1688 mulq %rbx
1689 addq %rax, %r9
1690 movq 24($ap), %rax
1691 movq %rdx, %r10
1692 adcq \$0, %r10
1693
1694 mulq %rbx
1695 addq %rax, %r10
1696 movq 32($ap), %rax
1697 movq %rdx, %r11
1698 adcq \$0, %r11
1699
1700 mulq %rbx
1701 addq %rax, %r11
1702 movq 40($ap), %rax
1703 movq %rdx, %r12
1704 adcq \$0, %r12
1705
1706 mulq %rbx
1707 addq %rax, %r12
1708 movq 48($ap), %rax
1709 movq %rdx, %r13
1710 adcq \$0, %r13
1711
1712 mulq %rbx
1713 addq %rax, %r13
1714 movq 56($ap), %rax
1715 movq %rdx, %r14
1716 adcq \$0, %r14
1717
1718 mulq %rbx
1719 addq %rax, %r14
1720 movq ($ap), %rax
1721 movq %rdx, %r15
1722 adcq \$0, %r15
1723
1724 leaq 8($bp), $bp
1725 leaq 8(%rdi), %rdi
1726
1727 movl \$7, %ecx
1728 jmp .Loop_mul
1729
1730 .align 32
1731 .Loop_mul:
1732 movq ($bp), %rbx
1733 mulq %rbx
1734 addq %rax, %r8
1735 movq 8($ap), %rax
1736 movq %r8, (%rdi)
1737 movq %rdx, %r8
1738 adcq \$0, %r8
1739
1740 mulq %rbx
1741 addq %rax, %r9
1742 movq 16($ap), %rax
1743 adcq \$0, %rdx
1744 addq %r9, %r8
1745 movq %rdx, %r9
1746 adcq \$0, %r9
1747
1748 mulq %rbx
1749 addq %rax, %r10
1750 movq 24($ap), %rax
1751 adcq \$0, %rdx
1752 addq %r10, %r9
1753 movq %rdx, %r10
1754 adcq \$0, %r10
1755
1756 mulq %rbx
1757 addq %rax, %r11
1758 movq 32($ap), %rax
1759 adcq \$0, %rdx
1760 addq %r11, %r10
1761 movq %rdx, %r11
1762 adcq \$0, %r11
1763
1764 mulq %rbx
1765 addq %rax, %r12
1766 movq 40($ap), %rax
1767 adcq \$0, %rdx
1768 addq %r12, %r11
1769 movq %rdx, %r12
1770 adcq \$0, %r12
1771
1772 mulq %rbx
1773 addq %rax, %r13
1774 movq 48($ap), %rax
1775 adcq \$0, %rdx
1776 addq %r13, %r12
1777 movq %rdx, %r13
1778 adcq \$0, %r13
1779
1780 mulq %rbx
1781 addq %rax, %r14
1782 movq 56($ap), %rax
1783 adcq \$0, %rdx
1784 addq %r14, %r13
1785 movq %rdx, %r14
1786 leaq 8($bp), $bp
1787 adcq \$0, %r14
1788
1789 mulq %rbx
1790 addq %rax, %r15
1791 movq ($ap), %rax
1792 adcq \$0, %rdx
1793 addq %r15, %r14
1794 movq %rdx, %r15
1795 adcq \$0, %r15
1796
1797 leaq 8(%rdi), %rdi
1798
1799 decl %ecx
1800 jnz .Loop_mul
1801
1802 movq %r8, (%rdi)
1803 movq %r9, 8(%rdi)
1804 movq %r10, 16(%rdi)
1805 movq %r11, 24(%rdi)
1806 movq %r12, 32(%rdi)
1807 movq %r13, 40(%rdi)
1808 movq %r14, 48(%rdi)
1809 movq %r15, 56(%rdi)
1810
1811 ret
1812 .size __rsaz_512_mul,.-__rsaz_512_mul
1813 ___
1814 }
1815 if ($addx) {
1816 # __rsaz_512_mulx
1817 #
1818 # input: %rsi - ap, %rbp - bp
1819 # output:
1820 # clobbers: everything
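# MULX leaves the flags untouched, and ADCX/ADOX each update only CF or only
# OF, so the code below keeps two independent carry chains in flight at once:
# ADCX folds the low halves of the products into the running limbs while ADOX
# simultaneously chains their high halves, roughly halving the carry-chain
# dependency compared to the single adc chain in __rsaz_512_mul.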
1821 my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi");
1822 $code.=<<___;
1823 .type __rsaz_512_mulx,\@abi-omnipotent
1824 .align 32
1825 __rsaz_512_mulx:
1826 mulx ($ap), %rbx, %r8 # initial %rdx preloaded by caller
1827 mov \$-6, %rcx
1828
1829 mulx 8($ap), %rax, %r9
1830 movq %rbx, 8(%rsp)
1831
1832 mulx 16($ap), %rbx, %r10
1833 adc %rax, %r8
1834
1835 mulx 24($ap), %rax, %r11
1836 adc %rbx, %r9
1837
1838 mulx 32($ap), %rbx, %r12
1839 adc %rax, %r10
1840
1841 mulx 40($ap), %rax, %r13
1842 adc %rbx, %r11
1843
1844 mulx 48($ap), %rbx, %r14
1845 adc %rax, %r12
1846
1847 mulx 56($ap), %rax, %r15
1848 mov 8($bp), %rdx
1849 adc %rbx, %r13
1850 adc %rax, %r14
1851 adc \$0, %r15
1852
1853 xor $zero, $zero # cf=0,of=0
1854 jmp .Loop_mulx
1855
1856 .align 32
1857 .Loop_mulx:
1858 movq %r8, %rbx
1859 mulx ($ap), %rax, %r8
1860 adcx %rax, %rbx
1861 adox %r9, %r8
1862
1863 mulx 8($ap), %rax, %r9
1864 adcx %rax, %r8
1865 adox %r10, %r9
1866
1867 mulx 16($ap), %rax, %r10
1868 adcx %rax, %r9
1869 adox %r11, %r10
1870
1871 mulx 24($ap), %rax, %r11
1872 adcx %rax, %r10
1873 adox %r12, %r11
1874
1875 .byte 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00 # mulx 32($ap), %rax, %r12
1876 adcx %rax, %r11
1877 adox %r13, %r12
1878
1879 mulx 40($ap), %rax, %r13
1880 adcx %rax, %r12
1881 adox %r14, %r13
1882
1883 mulx 48($ap), %rax, %r14
1884 adcx %rax, %r13
1885 adox %r15, %r14
1886
1887 mulx 56($ap), %rax, %r15
1888 movq 64($bp,%rcx,8), %rdx
1889 movq %rbx, 8+64-8(%rsp,%rcx,8)
1890 adcx %rax, %r14
1891 adox $zero, %r15
1892 adcx $zero, %r15 # cf=0
1893
1894 inc %rcx # of=0
1895 jnz .Loop_mulx
1896
1897 movq %r8, %rbx
1898 mulx ($ap), %rax, %r8
1899 adcx %rax, %rbx
1900 adox %r9, %r8
1901
1902 .byte 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00 # mulx 8($ap), %rax, %r9
1903 adcx %rax, %r8
1904 adox %r10, %r9
1905
1906 .byte 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00 # mulx 16($ap), %rax, %r10
1907 adcx %rax, %r9
1908 adox %r11, %r10
1909
1910 mulx 24($ap), %rax, %r11
1911 adcx %rax, %r10
1912 adox %r12, %r11
1913
1914 mulx 32($ap), %rax, %r12
1915 adcx %rax, %r11
1916 adox %r13, %r12
1917
1918 mulx 40($ap), %rax, %r13
1919 adcx %rax, %r12
1920 adox %r14, %r13
1921
1922 .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
1923 adcx %rax, %r13
1924 adox %r15, %r14
1925
1926 .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($ap), %rax, %r15
1927 adcx %rax, %r14
1928 adox $zero, %r15
1929 adcx $zero, %r15
1930
1931 mov %rbx, 8+64-8(%rsp)
1932 mov %r8, 8+64(%rsp)
1933 mov %r9, 8+64+8(%rsp)
1934 mov %r10, 8+64+16(%rsp)
1935 mov %r11, 8+64+24(%rsp)
1936 mov %r12, 8+64+32(%rsp)
1937 mov %r13, 8+64+40(%rsp)
1938 mov %r14, 8+64+48(%rsp)
1939 mov %r15, 8+64+56(%rsp)
1940
1941 ret
1942 .size __rsaz_512_mulx,.-__rsaz_512_mulx
1943 ___
1944 }
1945 {
1946 my ($out,$inp,$power)= $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
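# Table layout assumed by scatter4/gather4: each 512-bit entry is stored as
# sixteen 32-bit words, and word j of entry p (low word of limb 0 first)
# lives at byte offset 64*j + 4*p, i.e. the entries are interleaved at 32-bit
# granularity.  In Perl terms (table viewed as an array of 32-bit words,
# names ad hoc), a gather of entry $p reconstructs limb $k as:
#
#	$limb[$k] = $word[16*(2*$k) + $p] | ($word[16*(2*$k + 1) + $p] << 32);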
1947 $code.=<<___;
1948 .globl rsaz_512_scatter4
1949 .type rsaz_512_scatter4,\@abi-omnipotent
1950 .align 16
1951 rsaz_512_scatter4:
1952 leaq ($out,$power,4), $out
1953 movl \$8, %r9d
1954 jmp .Loop_scatter
1955 .align 16
1956 .Loop_scatter:
1957 movq ($inp), %rax
1958 leaq 8($inp), $inp
1959 movl %eax, ($out)
1960 shrq \$32, %rax
1961 movl %eax, 64($out)
1962 leaq 128($out), $out
1963 decl %r9d
1964 jnz .Loop_scatter
1965 ret
1966 .size rsaz_512_scatter4,.-rsaz_512_scatter4
1967
1968 .globl rsaz_512_gather4
1969 .type rsaz_512_gather4,\@abi-omnipotent
1970 .align 16
1971 rsaz_512_gather4:
1972 leaq ($inp,$power,4), $inp
1973 movl \$8, %r9d
1974 jmp .Loop_gather
1975 .align 16
1976 .Loop_gather:
1977 movl ($inp), %eax
1978 movl 64($inp), %r8d
1979 leaq 128($inp), $inp
1980 shlq \$32, %r8
1981 or %r8, %rax
1982 movq %rax, ($out)
1983 leaq 8($out), $out
1984 decl %r9d
1985 jnz .Loop_gather
1986 ret
1987 .size rsaz_512_gather4,.-rsaz_512_gather4
1988 ___
1989 }
1990
1991 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1992 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
1993 if ($win64) {
1994 $rec="%rcx";
1995 $frame="%rdx";
1996 $context="%r8";
1997 $disp="%r9";
1998
1999 $code.=<<___;
2000 .extern __imp_RtlVirtualUnwind
2001 .type se_handler,\@abi-omnipotent
2002 .align 16
2003 se_handler:
2004 push %rsi
2005 push %rdi
2006 push %rbx
2007 push %rbp
2008 push %r12
2009 push %r13
2010 push %r14
2011 push %r15
2012 pushfq
2013 sub \$64,%rsp
2014
2015 mov 120($context),%rax # pull context->Rax
2016 mov 248($context),%rbx # pull context->Rip
2017
2018 mov 8($disp),%rsi # disp->ImageBase
2019 mov 56($disp),%r11 # disp->HandlerData
2020
2021 mov 0(%r11),%r10d # HandlerData[0]
2022 lea (%rsi,%r10),%r10 # end of prologue label
2023 cmp %r10,%rbx # context->Rip<end of prologue label
2024 jb .Lcommon_seh_tail
2025
2026 mov 152($context),%rax # pull context->Rsp
2027
2028 mov 4(%r11),%r10d # HandlerData[1]
2029 lea (%rsi,%r10),%r10 # epilogue label
2030 cmp %r10,%rbx # context->Rip>=epilogue label
2031 jae .Lcommon_seh_tail
2032
2033 lea 128+24+48(%rax),%rax
2034
2035 mov -8(%rax),%rbx
2036 mov -16(%rax),%rbp
2037 mov -24(%rax),%r12
2038 mov -32(%rax),%r13
2039 mov -40(%rax),%r14
2040 mov -48(%rax),%r15
2041 mov %rbx,144($context) # restore context->Rbx
2042 mov %rbp,160($context) # restore context->Rbp
2043 mov %r12,216($context) # restore context->R12
2044 mov %r13,224($context) # restore context->R13
2045 mov %r14,232($context) # restore context->R14
2046 mov %r15,240($context) # restore context->R15
2047
2048 .Lcommon_seh_tail:
2049 mov 8(%rax),%rdi
2050 mov 16(%rax),%rsi
2051 mov %rax,152($context) # restore context->Rsp
2052 mov %rsi,168($context) # restore context->Rsi
2053 mov %rdi,176($context) # restore context->Rdi
2054
2055 mov 40($disp),%rdi # disp->ContextRecord
2056 mov $context,%rsi # context
2057 mov \$154,%ecx # sizeof(CONTEXT)
2058 .long 0xa548f3fc # cld; rep movsq
2059
2060 mov $disp,%rsi
2061 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
2062 mov 8(%rsi),%rdx # arg2, disp->ImageBase
2063 mov 0(%rsi),%r8 # arg3, disp->ControlPc
2064 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
2065 mov 40(%rsi),%r10 # disp->ContextRecord
2066 lea 56(%rsi),%r11 # &disp->HandlerData
2067 lea 24(%rsi),%r12 # &disp->EstablisherFrame
2068 mov %r10,32(%rsp) # arg5
2069 mov %r11,40(%rsp) # arg6
2070 mov %r12,48(%rsp) # arg7
2071 mov %rcx,56(%rsp) # arg8, (NULL)
2072 call *__imp_RtlVirtualUnwind(%rip)
2073
2074 mov \$1,%eax # ExceptionContinueSearch
2075 add \$64,%rsp
2076 popfq
2077 pop %r15
2078 pop %r14
2079 pop %r13
2080 pop %r12
2081 pop %rbp
2082 pop %rbx
2083 pop %rdi
2084 pop %rsi
2085 ret
2086 .size se_handler,.-se_handler
2087
2088 .section .pdata
2089 .align 4
2090 .rva .LSEH_begin_rsaz_512_sqr
2091 .rva .LSEH_end_rsaz_512_sqr
2092 .rva .LSEH_info_rsaz_512_sqr
2093
2094 .rva .LSEH_begin_rsaz_512_mul
2095 .rva .LSEH_end_rsaz_512_mul
2096 .rva .LSEH_info_rsaz_512_mul
2097
2098 .rva .LSEH_begin_rsaz_512_mul_gather4
2099 .rva .LSEH_end_rsaz_512_mul_gather4
2100 .rva .LSEH_info_rsaz_512_mul_gather4
2101
2102 .rva .LSEH_begin_rsaz_512_mul_scatter4
2103 .rva .LSEH_end_rsaz_512_mul_scatter4
2104 .rva .LSEH_info_rsaz_512_mul_scatter4
2105
2106 .rva .LSEH_begin_rsaz_512_mul_by_one
2107 .rva .LSEH_end_rsaz_512_mul_by_one
2108 .rva .LSEH_info_rsaz_512_mul_by_one
2109
2110 .section .xdata
2111 .align 8
2112 .LSEH_info_rsaz_512_sqr:
2113 .byte 9,0,0,0
2114 .rva se_handler
2115 .rva .Lsqr_body,.Lsqr_epilogue # HandlerData[]
2116 .LSEH_info_rsaz_512_mul:
2117 .byte 9,0,0,0
2118 .rva se_handler
2119 .rva .Lmul_body,.Lmul_epilogue # HandlerData[]
2120 .LSEH_info_rsaz_512_mul_gather4:
2121 .byte 9,0,0,0
2122 .rva se_handler
2123 .rva .Lmul_gather4_body,.Lmul_gather4_epilogue # HandlerData[]
2124 .LSEH_info_rsaz_512_mul_scatter4:
2125 .byte 9,0,0,0
2126 .rva se_handler
2127 .rva .Lmul_scatter4_body,.Lmul_scatter4_epilogue # HandlerData[]
2128 .LSEH_info_rsaz_512_mul_by_one:
2129 .byte 9,0,0,0
2130 .rva se_handler
2131 .rva .Lmul_by_one_body,.Lmul_by_one_epilogue # HandlerData[]
2132 ___
2133 }
2134
2135 $code =~ s/\`([^\`]*)\`/eval $1/gem;
2136 print $code;
2137 close STDOUT;