1 #!/usr/bin/env perl
2
3 ##############################################################################
4 # #
5 # Copyright (c) 2012, Intel Corporation #
6 # #
7 # All rights reserved. #
8 # #
9 # Redistribution and use in source and binary forms, with or without #
10 # modification, are permitted provided that the following conditions are #
11 # met: #
12 # #
13 # * Redistributions of source code must retain the above copyright #
14 # notice, this list of conditions and the following disclaimer. #
15 # #
16 # * Redistributions in binary form must reproduce the above copyright #
17 # notice, this list of conditions and the following disclaimer in the #
18 # documentation and/or other materials provided with the #
19 # distribution. #
20 # #
21 # * Neither the name of the Intel Corporation nor the names of its #
22 # contributors may be used to endorse or promote products derived from #
23 # this software without specific prior written permission. #
24 # #
25 # #
26 # THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY #
27 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE #
28 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR #
29 # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR #
30 # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, #
31 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, #
32 # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR #
33 # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF #
34 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING #
35 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS #
36 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #
37 # #
38 ##############################################################################
39 # Developers and authors: #
40 # Shay Gueron (1, 2), and Vlad Krasnov (1) #
41 # (1) Intel Architecture Group, Microprocessor and Chipset Development, #
42 # Israel Development Center, Haifa, Israel #
43 # (2) University of Haifa #
44 ##############################################################################
45 # Reference: #
46 # [1] S. Gueron, "Efficient Software Implementations of Modular #
47 # Exponentiation", http://eprint.iacr.org/2011/239 #
48 # [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring". #
49 # IEEE Proceedings of 9th International Conference on Information #
50 # Technology: New Generations (ITNG 2012), 821-823 (2012). #
51 # [3] S. Gueron, Efficient Software Implementations of Modular Exponentiation#
52 # Journal of Cryptographic Engineering 2:31-43 (2012). #
53 # [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis #
54 # resistant 512-bit and 1024-bit modular exponentiation for optimizing #
55 # RSA1024 and RSA2048 on x86_64 platforms", #
56 # http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest#
57 ##############################################################################
58
59 # While the original submission covers 512- and 1024-bit exponentiation,
60 # this module is limited to the 512-bit version only (and as such
61 # accelerates RSA1024 sign). This is because the improvement for longer
62 # keys is not high enough to justify the effort; the highest measured
63 # gain was ~5% on Westmere. [This is relative to OpenSSL 1.0.2, upcoming
64 # at the time of this writing!] Nor does this module implement a
65 # "monolithic" complete-exponentiation jumbo-subroutine; it adheres
66 # instead to a more modular mixture of C and assembly. And it's optimized
67 # even for processors other than the Intel Core family (see the table
68 # below for improvement coefficients).
69 # <appro@openssl.org>
70 #
71 # RSA1024 sign/sec this/original |this/rsax(*) this/fips(*)
72 # ----------------+---------------------------
73 # Opteron +13% |+5% +20%
74 # Bulldozer -0% |-1% +10%
75 # P4 +11% |+7% +8%
76 # Westmere +5% |+14% +17%
77 # Sandy Bridge +2% |+12% +29%
78 # Ivy Bridge +1% |+11% +35%
79 # Haswell(**) -0% |+12% +39%
80 # Atom +13% |+11% +4%
81 # VIA Nano +70% |+9% +25%
82 #
83 # (*) rsax engine and fips numbers are presented for reference
84 # purposes;
85 # (**) MULX was attempted, but found to give only marginal improvement;
86
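# The routines below are building blocks, not a complete exponentiation;
# a C driver (rsaz_exp.c in OpenSSL) is expected to compose them. The
# following fixed-window sketch is schematic only -- the window width w,
# the table handling, and the name RR (for R^2 mod m) are illustrative
# assumptions, not a transcript of the actual driver; handling of tbl[0]
# and of zero windows is omitted:
#
#	rsaz_512_mul(aR, a, RR, m, n0);			# aR = a*R mod m
#	rsaz_512_scatter4(tbl, aR, 1);			# tbl[1] = a*R
#	for (t = aR, i = 2; i < (1<<w); i++)		# tbl[i] = a^i*R
#		rsaz_512_mul_scatter4(t, aR, m, n0, tbl, i);
#	rsaz_512_gather4(t, tbl, top window of exponent);
#	for (each remaining w-bit window e, most significant first) {
#		rsaz_512_sqr(t, t, m, n0, w);		# w Montgomery squarings
#		rsaz_512_mul_gather4(t, t, tbl, m, n0, e);
#	}
#	rsaz_512_mul_by_one(r, t, m, n0);		# leave Montgomery domain
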
87 $flavour = shift;
88 $output = shift;
89 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
90
91 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
92
93 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
94 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
95 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
96 die "can't locate x86_64-xlate.pl";
97
98 open OUT,"| $^X $xlate $flavour $output";
99 *STDOUT=*OUT;
100
101 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
102 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
103 $addx = ($1>=2.23);
104 }
105
106 if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
107 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
108 $addx = ($1>=2.10);
109 }
110
111 if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
112 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
113 $addx = ($1>=11);
114 }
115
116 if (!$addx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) {
117 my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
118 $addx = ($ver>=3.03);
119 }
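# Note that the MULX/ADCX/ADOX paths are gated twice: the probes above only
# establish that the assembler in use can encode the instructions, and when
# $addx is set the generated functions additionally test OPENSSL_ia32cap_P
# at run time with the mask 0x80100, which corresponds to the CPUID leaf-7
# EBX bits for BMI2/MULX (bit 8) and ADX (bit 19).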
120
121 ($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp"); # common internal API
122 {
123 my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d");
124
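# The non-MULX squaring below follows the conventional pattern: the i-th
# iteration multiplies a[i] by the limbs above it (the cross products), the
# running off-diagonal sum is doubled, and the diagonal term a[i]^2 is
# folded in before two more limbs of the result are retired to the stack.
# The doubling avoids shld: "lea (carry,reg,2),reg" shifts a limb left by
# one while pulling in the bit saved from the previous limb with "shrq $63",
# as the #shld comments alongside the code indicate.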
125 $code.=<<___;
126 .text
127
128 .extern OPENSSL_ia32cap_P
129
130 .globl rsaz_512_sqr
131 .type rsaz_512_sqr,\@function,5
132 .align 32
133 rsaz_512_sqr: # 25-29% faster than rsaz_512_mul
134 push %rbx
135 push %rbp
136 push %r12
137 push %r13
138 push %r14
139 push %r15
140
141 subq \$128+24, %rsp
142 .Lsqr_body:
143 movq $mod, %rbp # common argument
144 movq ($inp), %rdx
145 movq 8($inp), %rax
146 movq $n0, 128(%rsp)
147 ___
148 $code.=<<___ if ($addx);
149 movl \$0x80100,%r11d
150 andl OPENSSL_ia32cap_P+8(%rip),%r11d
151 	cmpl	\$0x80100,%r11d		# check for MULX and ADCX/ADOX
152 je .Loop_sqrx
153 ___
154 $code.=<<___;
155 jmp .Loop_sqr
156
157 .align 32
158 .Loop_sqr:
159 movl $times,128+8(%rsp)
160 #first iteration
161 movq %rdx, %rbx
162 mulq %rdx
163 movq %rax, %r8
164 movq 16($inp), %rax
165 movq %rdx, %r9
166
167 mulq %rbx
168 addq %rax, %r9
169 movq 24($inp), %rax
170 movq %rdx, %r10
171 adcq \$0, %r10
172
173 mulq %rbx
174 addq %rax, %r10
175 movq 32($inp), %rax
176 movq %rdx, %r11
177 adcq \$0, %r11
178
179 mulq %rbx
180 addq %rax, %r11
181 movq 40($inp), %rax
182 movq %rdx, %r12
183 adcq \$0, %r12
184
185 mulq %rbx
186 addq %rax, %r12
187 movq 48($inp), %rax
188 movq %rdx, %r13
189 adcq \$0, %r13
190
191 mulq %rbx
192 addq %rax, %r13
193 movq 56($inp), %rax
194 movq %rdx, %r14
195 adcq \$0, %r14
196
197 mulq %rbx
198 addq %rax, %r14
199 movq %rbx, %rax
200 movq %rdx, %r15
201 adcq \$0, %r15
202
203 addq %r8, %r8 #shlq \$1, %r8
204 movq %r9, %rcx
205 adcq %r9, %r9 #shld \$1, %r8, %r9
206
207 mulq %rax
208 movq %rax, (%rsp)
209 addq %rdx, %r8
210 adcq \$0, %r9
211
212 movq %r8, 8(%rsp)
213 shrq \$63, %rcx
214
215 #second iteration
216 movq 8($inp), %r8
217 movq 16($inp), %rax
218 mulq %r8
219 addq %rax, %r10
220 movq 24($inp), %rax
221 movq %rdx, %rbx
222 adcq \$0, %rbx
223
224 mulq %r8
225 addq %rax, %r11
226 movq 32($inp), %rax
227 adcq \$0, %rdx
228 addq %rbx, %r11
229 movq %rdx, %rbx
230 adcq \$0, %rbx
231
232 mulq %r8
233 addq %rax, %r12
234 movq 40($inp), %rax
235 adcq \$0, %rdx
236 addq %rbx, %r12
237 movq %rdx, %rbx
238 adcq \$0, %rbx
239
240 mulq %r8
241 addq %rax, %r13
242 movq 48($inp), %rax
243 adcq \$0, %rdx
244 addq %rbx, %r13
245 movq %rdx, %rbx
246 adcq \$0, %rbx
247
248 mulq %r8
249 addq %rax, %r14
250 movq 56($inp), %rax
251 adcq \$0, %rdx
252 addq %rbx, %r14
253 movq %rdx, %rbx
254 adcq \$0, %rbx
255
256 mulq %r8
257 addq %rax, %r15
258 movq %r8, %rax
259 adcq \$0, %rdx
260 addq %rbx, %r15
261 movq %rdx, %r8
262 movq %r10, %rdx
263 adcq \$0, %r8
264
265 add %rdx, %rdx
266 lea (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
267 movq %r11, %rbx
268 adcq %r11, %r11 #shld \$1, %r10, %r11
269
270 mulq %rax
271 addq %rax, %r9
272 adcq %rdx, %r10
273 adcq \$0, %r11
274
275 movq %r9, 16(%rsp)
276 movq %r10, 24(%rsp)
277 shrq \$63, %rbx
278
279 #third iteration
280 movq 16($inp), %r9
281 movq 24($inp), %rax
282 mulq %r9
283 addq %rax, %r12
284 movq 32($inp), %rax
285 movq %rdx, %rcx
286 adcq \$0, %rcx
287
288 mulq %r9
289 addq %rax, %r13
290 movq 40($inp), %rax
291 adcq \$0, %rdx
292 addq %rcx, %r13
293 movq %rdx, %rcx
294 adcq \$0, %rcx
295
296 mulq %r9
297 addq %rax, %r14
298 movq 48($inp), %rax
299 adcq \$0, %rdx
300 addq %rcx, %r14
301 movq %rdx, %rcx
302 adcq \$0, %rcx
303
304 mulq %r9
305 movq %r12, %r10
306 lea (%rbx,%r12,2), %r12 #shld \$1, %rbx, %r12
307 addq %rax, %r15
308 movq 56($inp), %rax
309 adcq \$0, %rdx
310 addq %rcx, %r15
311 movq %rdx, %rcx
312 adcq \$0, %rcx
313
314 mulq %r9
315 shrq \$63, %r10
316 addq %rax, %r8
317 movq %r9, %rax
318 adcq \$0, %rdx
319 addq %rcx, %r8
320 movq %rdx, %r9
321 adcq \$0, %r9
322
323 movq %r13, %rcx
324 leaq (%r10,%r13,2), %r13 #shld \$1, %r12, %r13
325
326 mulq %rax
327 addq %rax, %r11
328 adcq %rdx, %r12
329 adcq \$0, %r13
330
331 movq %r11, 32(%rsp)
332 movq %r12, 40(%rsp)
333 shrq \$63, %rcx
334
335 #fourth iteration
336 movq 24($inp), %r10
337 movq 32($inp), %rax
338 mulq %r10
339 addq %rax, %r14
340 movq 40($inp), %rax
341 movq %rdx, %rbx
342 adcq \$0, %rbx
343
344 mulq %r10
345 addq %rax, %r15
346 movq 48($inp), %rax
347 adcq \$0, %rdx
348 addq %rbx, %r15
349 movq %rdx, %rbx
350 adcq \$0, %rbx
351
352 mulq %r10
353 movq %r14, %r12
354 leaq (%rcx,%r14,2), %r14 #shld \$1, %rcx, %r14
355 addq %rax, %r8
356 movq 56($inp), %rax
357 adcq \$0, %rdx
358 addq %rbx, %r8
359 movq %rdx, %rbx
360 adcq \$0, %rbx
361
362 mulq %r10
363 shrq \$63, %r12
364 addq %rax, %r9
365 movq %r10, %rax
366 adcq \$0, %rdx
367 addq %rbx, %r9
368 movq %rdx, %r10
369 adcq \$0, %r10
370
371 movq %r15, %rbx
372 leaq (%r12,%r15,2),%r15 #shld \$1, %r14, %r15
373
374 mulq %rax
375 addq %rax, %r13
376 adcq %rdx, %r14
377 adcq \$0, %r15
378
379 movq %r13, 48(%rsp)
380 movq %r14, 56(%rsp)
381 shrq \$63, %rbx
382
383 #fifth iteration
384 movq 32($inp), %r11
385 movq 40($inp), %rax
386 mulq %r11
387 addq %rax, %r8
388 movq 48($inp), %rax
389 movq %rdx, %rcx
390 adcq \$0, %rcx
391
392 mulq %r11
393 addq %rax, %r9
394 movq 56($inp), %rax
395 adcq \$0, %rdx
396 movq %r8, %r12
397 leaq (%rbx,%r8,2), %r8 #shld \$1, %rbx, %r8
398 addq %rcx, %r9
399 movq %rdx, %rcx
400 adcq \$0, %rcx
401
402 mulq %r11
403 shrq \$63, %r12
404 addq %rax, %r10
405 movq %r11, %rax
406 adcq \$0, %rdx
407 addq %rcx, %r10
408 movq %rdx, %r11
409 adcq \$0, %r11
410
411 movq %r9, %rcx
412 leaq (%r12,%r9,2), %r9 #shld \$1, %r8, %r9
413
414 mulq %rax
415 addq %rax, %r15
416 adcq %rdx, %r8
417 adcq \$0, %r9
418
419 movq %r15, 64(%rsp)
420 movq %r8, 72(%rsp)
421 shrq \$63, %rcx
422
423 #sixth iteration
424 movq 40($inp), %r12
425 movq 48($inp), %rax
426 mulq %r12
427 addq %rax, %r10
428 movq 56($inp), %rax
429 movq %rdx, %rbx
430 adcq \$0, %rbx
431
432 mulq %r12
433 addq %rax, %r11
434 movq %r12, %rax
435 movq %r10, %r15
436 leaq (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
437 adcq \$0, %rdx
438 shrq \$63, %r15
439 addq %rbx, %r11
440 movq %rdx, %r12
441 adcq \$0, %r12
442
443 movq %r11, %rbx
444 leaq (%r15,%r11,2), %r11 #shld \$1, %r10, %r11
445
446 mulq %rax
447 addq %rax, %r9
448 adcq %rdx, %r10
449 adcq \$0, %r11
450
451 movq %r9, 80(%rsp)
452 movq %r10, 88(%rsp)
453
454 #seventh iteration
455 movq 48($inp), %r13
456 movq 56($inp), %rax
457 mulq %r13
458 addq %rax, %r12
459 movq %r13, %rax
460 movq %rdx, %r13
461 adcq \$0, %r13
462
463 xorq %r14, %r14
464 shlq \$1, %rbx
465 adcq %r12, %r12 #shld \$1, %rbx, %r12
466 adcq %r13, %r13 #shld \$1, %r12, %r13
467 adcq %r14, %r14 #shld \$1, %r13, %r14
468
469 mulq %rax
470 addq %rax, %r11
471 adcq %rdx, %r12
472 adcq \$0, %r13
473
474 movq %r11, 96(%rsp)
475 movq %r12, 104(%rsp)
476
477 #eighth iteration
478 movq 56($inp), %rax
479 mulq %rax
480 addq %rax, %r13
481 adcq \$0, %rdx
482
483 addq %rdx, %r14
484
485 movq %r13, 112(%rsp)
486 movq %r14, 120(%rsp)
487
488 movq (%rsp), %r8
489 movq 8(%rsp), %r9
490 movq 16(%rsp), %r10
491 movq 24(%rsp), %r11
492 movq 32(%rsp), %r12
493 movq 40(%rsp), %r13
494 movq 48(%rsp), %r14
495 movq 56(%rsp), %r15
496
497 call __rsaz_512_reduce
498
499 addq 64(%rsp), %r8
500 adcq 72(%rsp), %r9
501 adcq 80(%rsp), %r10
502 adcq 88(%rsp), %r11
503 adcq 96(%rsp), %r12
504 adcq 104(%rsp), %r13
505 adcq 112(%rsp), %r14
506 adcq 120(%rsp), %r15
507 sbbq %rcx, %rcx
508
509 call __rsaz_512_subtract
510
511 movq %r8, %rdx
512 movq %r9, %rax
513 movl 128+8(%rsp), $times
514 movq $out, $inp
515
516 decl $times
517 jnz .Loop_sqr
518 ___
519 if ($addx) {
520 $code.=<<___;
521 jmp .Lsqr_tail
522
523 .align 32
524 .Loop_sqrx:
525 movl $times,128+8(%rsp)
526 movq $out, %xmm0 # off-load
527 movq %rbp, %xmm1 # off-load
528 #first iteration
529 mulx %rax, %r8, %r9
530
531 mulx 16($inp), %rcx, %r10
532 xor %rbp, %rbp # cf=0, of=0
533
534 mulx 24($inp), %rax, %r11
535 adcx %rcx, %r9
536
537 mulx 32($inp), %rcx, %r12
538 adcx %rax, %r10
539
540 mulx 40($inp), %rax, %r13
541 adcx %rcx, %r11
542
543 .byte 0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($inp), %rcx, %r14
544 adcx %rax, %r12
545 adcx %rcx, %r13
546
547 .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r15
548 adcx %rax, %r14
549 adcx %rbp, %r15 # %rbp is 0
550
551 mov %r9, %rcx
552 shld \$1, %r8, %r9
553 shl \$1, %r8
554
555 xor %ebp, %ebp
556 mulx %rdx, %rax, %rdx
557 adcx %rdx, %r8
558 mov 8($inp), %rdx
559 adcx %rbp, %r9
560
561 mov %rax, (%rsp)
562 mov %r8, 8(%rsp)
563
564 #second iteration
565 mulx 16($inp), %rax, %rbx
566 adox %rax, %r10
567 adcx %rbx, %r11
568
569 .byte 0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r8
570 adox $out, %r11
571 adcx %r8, %r12
572
573 mulx 32($inp), %rax, %rbx
574 adox %rax, %r12
575 adcx %rbx, %r13
576
577 mulx 40($inp), $out, %r8
578 adox $out, %r13
579 adcx %r8, %r14
580
581 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
582 adox %rax, %r14
583 adcx %rbx, %r15
584
585 .byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r8
586 adox $out, %r15
587 adcx %rbp, %r8
588 adox %rbp, %r8
589
590 mov %r11, %rbx
591 shld \$1, %r10, %r11
592 shld \$1, %rcx, %r10
593
594 xor %ebp,%ebp
595 mulx %rdx, %rax, %rcx
596 mov 16($inp), %rdx
597 adcx %rax, %r9
598 adcx %rcx, %r10
599 adcx %rbp, %r11
600
601 mov %r9, 16(%rsp)
602 .byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 # mov %r10, 24(%rsp)
603
604 #third iteration
605 .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r9
606 adox $out, %r12
607 adcx %r9, %r13
608
609 mulx 32($inp), %rax, %rcx
610 adox %rax, %r13
611 adcx %rcx, %r14
612
613 mulx 40($inp), $out, %r9
614 adox $out, %r14
615 adcx %r9, %r15
616
617 .byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rcx
618 adox %rax, %r15
619 adcx %rcx, %r8
620
621 .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r9
622 adox $out, %r8
623 adcx %rbp, %r9
624 adox %rbp, %r9
625
626 mov %r13, %rcx
627 shld \$1, %r12, %r13
628 shld \$1, %rbx, %r12
629
630 xor %ebp, %ebp
631 mulx %rdx, %rax, %rdx
632 adcx %rax, %r11
633 adcx %rdx, %r12
634 mov 24($inp), %rdx
635 adcx %rbp, %r13
636
637 mov %r11, 32(%rsp)
638 .byte 0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00 # mov %r12, 40(%rsp)
639
640 #fourth iteration
641 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00 # mulx 32($inp), %rax, %rbx
642 adox %rax, %r14
643 adcx %rbx, %r15
644
645 mulx 40($inp), $out, %r10
646 adox $out, %r15
647 adcx %r10, %r8
648
649 mulx 48($inp), %rax, %rbx
650 adox %rax, %r8
651 adcx %rbx, %r9
652
653 mulx 56($inp), $out, %r10
654 adox $out, %r9
655 adcx %rbp, %r10
656 adox %rbp, %r10
657
658 .byte 0x66
659 mov %r15, %rbx
660 shld \$1, %r14, %r15
661 shld \$1, %rcx, %r14
662
663 xor %ebp, %ebp
664 mulx %rdx, %rax, %rdx
665 adcx %rax, %r13
666 adcx %rdx, %r14
667 mov 32($inp), %rdx
668 adcx %rbp, %r15
669
670 mov %r13, 48(%rsp)
671 mov %r14, 56(%rsp)
672
673 #fifth iteration
674 .byte 0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00 # mulx 40($inp), $out, %r11
675 adox $out, %r8
676 adcx %r11, %r9
677
678 mulx 48($inp), %rax, %rcx
679 adox %rax, %r9
680 adcx %rcx, %r10
681
682 mulx 56($inp), $out, %r11
683 adox $out, %r10
684 adcx %rbp, %r11
685 adox %rbp, %r11
686
687 mov %r9, %rcx
688 shld \$1, %r8, %r9
689 shld \$1, %rbx, %r8
690
691 xor %ebp, %ebp
692 mulx %rdx, %rax, %rdx
693 adcx %rax, %r15
694 adcx %rdx, %r8
695 mov 40($inp), %rdx
696 adcx %rbp, %r9
697
698 mov %r15, 64(%rsp)
699 mov %r8, 72(%rsp)
700
701 #sixth iteration
702 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
703 adox %rax, %r10
704 adcx %rbx, %r11
705
706 .byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r12
707 adox $out, %r11
708 adcx %rbp, %r12
709 adox %rbp, %r12
710
711 mov %r11, %rbx
712 shld \$1, %r10, %r11
713 shld \$1, %rcx, %r10
714
715 xor %ebp, %ebp
716 mulx %rdx, %rax, %rdx
717 adcx %rax, %r9
718 adcx %rdx, %r10
719 mov 48($inp), %rdx
720 adcx %rbp, %r11
721
722 mov %r9, 80(%rsp)
723 mov %r10, 88(%rsp)
724
725 #seventh iteration
726 .byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r13
727 adox %rax, %r12
728 adox %rbp, %r13
729
730 xor %r14, %r14
731 shld \$1, %r13, %r14
732 shld \$1, %r12, %r13
733 shld \$1, %rbx, %r12
734
735 xor %ebp, %ebp
736 mulx %rdx, %rax, %rdx
737 adcx %rax, %r11
738 adcx %rdx, %r12
739 mov 56($inp), %rdx
740 adcx %rbp, %r13
741
742 .byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 # mov %r11, 96(%rsp)
743 .byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 # mov %r12, 104(%rsp)
744
745 #eighth iteration
746 mulx %rdx, %rax, %rdx
747 adox %rax, %r13
748 adox %rbp, %rdx
749
750 .byte 0x66
751 add %rdx, %r14
752
753 movq %r13, 112(%rsp)
754 movq %r14, 120(%rsp)
755 movq %xmm0, $out
756 movq %xmm1, %rbp
757
758 movq 128(%rsp), %rdx # pull $n0
759 movq (%rsp), %r8
760 movq 8(%rsp), %r9
761 movq 16(%rsp), %r10
762 movq 24(%rsp), %r11
763 movq 32(%rsp), %r12
764 movq 40(%rsp), %r13
765 movq 48(%rsp), %r14
766 movq 56(%rsp), %r15
767
768 call __rsaz_512_reducex
769
770 addq 64(%rsp), %r8
771 adcq 72(%rsp), %r9
772 adcq 80(%rsp), %r10
773 adcq 88(%rsp), %r11
774 adcq 96(%rsp), %r12
775 adcq 104(%rsp), %r13
776 adcq 112(%rsp), %r14
777 adcq 120(%rsp), %r15
778 sbbq %rcx, %rcx
779
780 call __rsaz_512_subtract
781
782 movq %r8, %rdx
783 movq %r9, %rax
784 movl 128+8(%rsp), $times
785 movq $out, $inp
786
787 decl $times
788 jnz .Loop_sqrx
789
790 .Lsqr_tail:
791 ___
792 }
793 $code.=<<___;
794
795 leaq 128+24+48(%rsp), %rax
796 movq -48(%rax), %r15
797 movq -40(%rax), %r14
798 movq -32(%rax), %r13
799 movq -24(%rax), %r12
800 movq -16(%rax), %rbp
801 movq -8(%rax), %rbx
802 leaq (%rax), %rsp
803 .Lsqr_epilogue:
804 ret
805 .size rsaz_512_sqr,.-rsaz_512_sqr
806 ___
807 }
808 {
809 my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
810 $code.=<<___;
811 .globl rsaz_512_mul
812 .type rsaz_512_mul,\@function,5
813 .align 32
814 rsaz_512_mul:
815 push %rbx
816 push %rbp
817 push %r12
818 push %r13
819 push %r14
820 push %r15
821
822 subq \$128+24, %rsp
823 .Lmul_body:
824 movq $out, %xmm0 # off-load arguments
825 movq $mod, %xmm1
826 movq $n0, 128(%rsp)
827 ___
828 $code.=<<___ if ($addx);
829 movl \$0x80100,%r11d
830 andl OPENSSL_ia32cap_P+8(%rip),%r11d
831 cmpl \$0x80100,%r11d # check for MULX and ADO/CX
832 je .Lmulx
833 ___
834 $code.=<<___;
835 movq ($bp), %rbx # pass b[0]
836 movq $bp, %rbp # pass argument
837 call __rsaz_512_mul
838
839 movq %xmm0, $out
840 movq %xmm1, %rbp
841
842 movq (%rsp), %r8
843 movq 8(%rsp), %r9
844 movq 16(%rsp), %r10
845 movq 24(%rsp), %r11
846 movq 32(%rsp), %r12
847 movq 40(%rsp), %r13
848 movq 48(%rsp), %r14
849 movq 56(%rsp), %r15
850
851 call __rsaz_512_reduce
852 ___
853 $code.=<<___ if ($addx);
854 jmp .Lmul_tail
855
856 .align 32
857 .Lmulx:
858 movq $bp, %rbp # pass argument
859 movq ($bp), %rdx # pass b[0]
860 call __rsaz_512_mulx
861
862 movq %xmm0, $out
863 movq %xmm1, %rbp
864
865 movq 128(%rsp), %rdx # pull $n0
866 movq (%rsp), %r8
867 movq 8(%rsp), %r9
868 movq 16(%rsp), %r10
869 movq 24(%rsp), %r11
870 movq 32(%rsp), %r12
871 movq 40(%rsp), %r13
872 movq 48(%rsp), %r14
873 movq 56(%rsp), %r15
874
875 call __rsaz_512_reducex
876 .Lmul_tail:
877 ___
878 $code.=<<___;
879 addq 64(%rsp), %r8
880 adcq 72(%rsp), %r9
881 adcq 80(%rsp), %r10
882 adcq 88(%rsp), %r11
883 adcq 96(%rsp), %r12
884 adcq 104(%rsp), %r13
885 adcq 112(%rsp), %r14
886 adcq 120(%rsp), %r15
887 sbbq %rcx, %rcx
888
889 call __rsaz_512_subtract
890
891 leaq 128+24+48(%rsp), %rax
892 movq -48(%rax), %r15
893 movq -40(%rax), %r14
894 movq -32(%rax), %r13
895 movq -24(%rax), %r12
896 movq -16(%rax), %rbp
897 movq -8(%rax), %rbx
898 leaq (%rax), %rsp
899 .Lmul_epilogue:
900 ret
901 .size rsaz_512_mul,.-rsaz_512_mul
902 ___
903 }
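# rsaz_512_mul_gather4 multiplies $ap by one entry of a power table laid out
# by rsaz_512_scatter4/rsaz_512_mul_scatter4: every 64-bit limb of an entry
# is stored as two 32-bit words 64 bytes apart, consecutive limbs are 128
# bytes apart, and $pwr selects a 4-byte column.  Each multiplier limb is
# therefore reassembled on the fly -- with movd/pslldq/por into %xmm4 --
# while the mulq chain for the previous limb is still in flight.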
904 {
905 my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
906 $code.=<<___;
907 .globl rsaz_512_mul_gather4
908 .type rsaz_512_mul_gather4,\@function,6
909 .align 32
910 rsaz_512_mul_gather4:
911 push %rbx
912 push %rbp
913 push %r12
914 push %r13
915 push %r14
916 push %r15
917
918 	mov	$pwr, $pwr		# zero-extend $pwr
919 subq \$128+24, %rsp
920 .Lmul_gather4_body:
921 ___
922 $code.=<<___ if ($addx);
923 movl \$0x80100,%r11d
924 andl OPENSSL_ia32cap_P+8(%rip),%r11d
925 cmpl \$0x80100,%r11d # check for MULX and ADO/CX
926 je .Lmulx_gather
927 ___
928 $code.=<<___;
929 movl 64($bp,$pwr,4), %eax
930 movq $out, %xmm0 # off-load arguments
931 movl ($bp,$pwr,4), %ebx
932 movq $mod, %xmm1
933 movq $n0, 128(%rsp)
934
935 shlq \$32, %rax
936 or %rax, %rbx
937 movq ($ap), %rax
938 movq 8($ap), %rcx
939 leaq 128($bp,$pwr,4), %rbp
940 mulq %rbx # 0 iteration
941 movq %rax, (%rsp)
942 movq %rcx, %rax
943 movq %rdx, %r8
944
945 mulq %rbx
946 movd (%rbp), %xmm4
947 addq %rax, %r8
948 movq 16($ap), %rax
949 movq %rdx, %r9
950 adcq \$0, %r9
951
952 mulq %rbx
953 movd 64(%rbp), %xmm5
954 addq %rax, %r9
955 movq 24($ap), %rax
956 movq %rdx, %r10
957 adcq \$0, %r10
958
959 mulq %rbx
960 pslldq \$4, %xmm5
961 addq %rax, %r10
962 movq 32($ap), %rax
963 movq %rdx, %r11
964 adcq \$0, %r11
965
966 mulq %rbx
967 por %xmm5, %xmm4
968 addq %rax, %r11
969 movq 40($ap), %rax
970 movq %rdx, %r12
971 adcq \$0, %r12
972
973 mulq %rbx
974 addq %rax, %r12
975 movq 48($ap), %rax
976 movq %rdx, %r13
977 adcq \$0, %r13
978
979 mulq %rbx
980 leaq 128(%rbp), %rbp
981 addq %rax, %r13
982 movq 56($ap), %rax
983 movq %rdx, %r14
984 adcq \$0, %r14
985
986 mulq %rbx
987 movq %xmm4, %rbx
988 addq %rax, %r14
989 movq ($ap), %rax
990 movq %rdx, %r15
991 adcq \$0, %r15
992
993 leaq 8(%rsp), %rdi
994 movl \$7, %ecx
995 jmp .Loop_mul_gather
996
997 .align 32
998 .Loop_mul_gather:
999 mulq %rbx
1000 addq %rax, %r8
1001 movq 8($ap), %rax
1002 movq %r8, (%rdi)
1003 movq %rdx, %r8
1004 adcq \$0, %r8
1005
1006 mulq %rbx
1007 movd (%rbp), %xmm4
1008 addq %rax, %r9
1009 movq 16($ap), %rax
1010 adcq \$0, %rdx
1011 addq %r9, %r8
1012 movq %rdx, %r9
1013 adcq \$0, %r9
1014
1015 mulq %rbx
1016 movd 64(%rbp), %xmm5
1017 addq %rax, %r10
1018 movq 24($ap), %rax
1019 adcq \$0, %rdx
1020 addq %r10, %r9
1021 movq %rdx, %r10
1022 adcq \$0, %r10
1023
1024 mulq %rbx
1025 pslldq \$4, %xmm5
1026 addq %rax, %r11
1027 movq 32($ap), %rax
1028 adcq \$0, %rdx
1029 addq %r11, %r10
1030 movq %rdx, %r11
1031 adcq \$0, %r11
1032
1033 mulq %rbx
1034 por %xmm5, %xmm4
1035 addq %rax, %r12
1036 movq 40($ap), %rax
1037 adcq \$0, %rdx
1038 addq %r12, %r11
1039 movq %rdx, %r12
1040 adcq \$0, %r12
1041
1042 mulq %rbx
1043 addq %rax, %r13
1044 movq 48($ap), %rax
1045 adcq \$0, %rdx
1046 addq %r13, %r12
1047 movq %rdx, %r13
1048 adcq \$0, %r13
1049
1050 mulq %rbx
1051 addq %rax, %r14
1052 movq 56($ap), %rax
1053 adcq \$0, %rdx
1054 addq %r14, %r13
1055 movq %rdx, %r14
1056 adcq \$0, %r14
1057
1058 mulq %rbx
1059 movq %xmm4, %rbx
1060 addq %rax, %r15
1061 movq ($ap), %rax
1062 adcq \$0, %rdx
1063 addq %r15, %r14
1064 movq %rdx, %r15
1065 adcq \$0, %r15
1066
1067 leaq 128(%rbp), %rbp
1068 leaq 8(%rdi), %rdi
1069
1070 decl %ecx
1071 jnz .Loop_mul_gather
1072
1073 movq %r8, (%rdi)
1074 movq %r9, 8(%rdi)
1075 movq %r10, 16(%rdi)
1076 movq %r11, 24(%rdi)
1077 movq %r12, 32(%rdi)
1078 movq %r13, 40(%rdi)
1079 movq %r14, 48(%rdi)
1080 movq %r15, 56(%rdi)
1081
1082 movq %xmm0, $out
1083 movq %xmm1, %rbp
1084
1085 movq (%rsp), %r8
1086 movq 8(%rsp), %r9
1087 movq 16(%rsp), %r10
1088 movq 24(%rsp), %r11
1089 movq 32(%rsp), %r12
1090 movq 40(%rsp), %r13
1091 movq 48(%rsp), %r14
1092 movq 56(%rsp), %r15
1093
1094 call __rsaz_512_reduce
1095 ___
1096 $code.=<<___ if ($addx);
1097 jmp .Lmul_gather_tail
1098
1099 .align 32
1100 .Lmulx_gather:
1101 mov 64($bp,$pwr,4), %eax
1102 movq $out, %xmm0 # off-load arguments
1103 lea 128($bp,$pwr,4), %rbp
1104 mov ($bp,$pwr,4), %edx
1105 movq $mod, %xmm1
1106 mov $n0, 128(%rsp)
1107
1108 shl \$32, %rax
1109 or %rax, %rdx
1110 mulx ($ap), %rbx, %r8 # 0 iteration
1111 mov %rbx, (%rsp)
1112 xor %edi, %edi # cf=0, of=0
1113
1114 mulx 8($ap), %rax, %r9
1115 movd (%rbp), %xmm4
1116
1117 mulx 16($ap), %rbx, %r10
1118 movd 64(%rbp), %xmm5
1119 adcx %rax, %r8
1120
1121 mulx 24($ap), %rax, %r11
1122 pslldq \$4, %xmm5
1123 adcx %rbx, %r9
1124
1125 mulx 32($ap), %rbx, %r12
1126 por %xmm5, %xmm4
1127 adcx %rax, %r10
1128
1129 mulx 40($ap), %rax, %r13
1130 adcx %rbx, %r11
1131
1132 mulx 48($ap), %rbx, %r14
1133 lea 128(%rbp), %rbp
1134 adcx %rax, %r12
1135
1136 mulx 56($ap), %rax, %r15
1137 movq %xmm4, %rdx
1138 adcx %rbx, %r13
1139 adcx %rax, %r14
1140 mov %r8, %rbx
1141 adcx %rdi, %r15 # %rdi is 0
1142
1143 mov \$-7, %rcx
1144 jmp .Loop_mulx_gather
1145
1146 .align 32
1147 .Loop_mulx_gather:
1148 mulx ($ap), %rax, %r8
1149 adcx %rax, %rbx
1150 adox %r9, %r8
1151
1152 mulx 8($ap), %rax, %r9
1153 .byte 0x66,0x0f,0x6e,0xa5,0x00,0x00,0x00,0x00 # movd (%rbp), %xmm4
1154 adcx %rax, %r8
1155 adox %r10, %r9
1156
1157 mulx 16($ap), %rax, %r10
1158 movd 64(%rbp), %xmm5
1159 lea 128(%rbp), %rbp
1160 adcx %rax, %r9
1161 adox %r11, %r10
1162
1163 .byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 # mulx 24($ap), %rax, %r11
1164 pslldq \$4, %xmm5
1165 por %xmm5, %xmm4
1166 adcx %rax, %r10
1167 adox %r12, %r11
1168
1169 mulx 32($ap), %rax, %r12
1170 adcx %rax, %r11
1171 adox %r13, %r12
1172
1173 mulx 40($ap), %rax, %r13
1174 adcx %rax, %r12
1175 adox %r14, %r13
1176
1177 .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
1178 adcx %rax, %r13
1179 adox %r15, %r14
1180
1181 mulx 56($ap), %rax, %r15
1182 movq %xmm4, %rdx
1183 mov %rbx, 64(%rsp,%rcx,8)
1184 adcx %rax, %r14
1185 adox %rdi, %r15
1186 mov %r8, %rbx
1187 adcx %rdi, %r15 # cf=0
1188
1189 inc %rcx # of=0
1190 jnz .Loop_mulx_gather
1191
1192 mov %r8, 64(%rsp)
1193 mov %r9, 64+8(%rsp)
1194 mov %r10, 64+16(%rsp)
1195 mov %r11, 64+24(%rsp)
1196 mov %r12, 64+32(%rsp)
1197 mov %r13, 64+40(%rsp)
1198 mov %r14, 64+48(%rsp)
1199 mov %r15, 64+56(%rsp)
1200
1201 movq %xmm0, $out
1202 movq %xmm1, %rbp
1203
1204 mov 128(%rsp), %rdx # pull $n0
1205 mov (%rsp), %r8
1206 mov 8(%rsp), %r9
1207 mov 16(%rsp), %r10
1208 mov 24(%rsp), %r11
1209 mov 32(%rsp), %r12
1210 mov 40(%rsp), %r13
1211 mov 48(%rsp), %r14
1212 mov 56(%rsp), %r15
1213
1214 call __rsaz_512_reducex
1215
1216 .Lmul_gather_tail:
1217 ___
1218 $code.=<<___;
1219 addq 64(%rsp), %r8
1220 adcq 72(%rsp), %r9
1221 adcq 80(%rsp), %r10
1222 adcq 88(%rsp), %r11
1223 adcq 96(%rsp), %r12
1224 adcq 104(%rsp), %r13
1225 adcq 112(%rsp), %r14
1226 adcq 120(%rsp), %r15
1227 sbbq %rcx, %rcx
1228
1229 call __rsaz_512_subtract
1230
1231 leaq 128+24+48(%rsp), %rax
1232 movq -48(%rax), %r15
1233 movq -40(%rax), %r14
1234 movq -32(%rax), %r13
1235 movq -24(%rax), %r12
1236 movq -16(%rax), %rbp
1237 movq -8(%rax), %rbx
1238 leaq (%rax), %rsp
1239 .Lmul_gather4_epilogue:
1240 ret
1241 .size rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
1242 ___
1243 }
1244 {
1245 my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1246 $code.=<<___;
1247 .globl rsaz_512_mul_scatter4
1248 .type rsaz_512_mul_scatter4,\@function,6
1249 .align 32
1250 rsaz_512_mul_scatter4:
1251 push %rbx
1252 push %rbp
1253 push %r12
1254 push %r13
1255 push %r14
1256 push %r15
1257
1258 	mov	$pwr, $pwr		# zero-extend $pwr
1259 subq \$128+24, %rsp
1260 .Lmul_scatter4_body:
1261 leaq ($tbl,$pwr,4), $tbl
1262 movq $out, %xmm0 # off-load arguments
1263 movq $mod, %xmm1
1264 movq $tbl, %xmm2
1265 movq $n0, 128(%rsp)
1266
1267 movq $out, %rbp
1268 ___
1269 $code.=<<___ if ($addx);
1270 movl \$0x80100,%r11d
1271 andl OPENSSL_ia32cap_P+8(%rip),%r11d
1272 cmpl \$0x80100,%r11d # check for MULX and ADO/CX
1273 je .Lmulx_scatter
1274 ___
1275 $code.=<<___;
1276 movq ($out),%rbx # pass b[0]
1277 call __rsaz_512_mul
1278
1279 movq %xmm0, $out
1280 movq %xmm1, %rbp
1281
1282 movq (%rsp), %r8
1283 movq 8(%rsp), %r9
1284 movq 16(%rsp), %r10
1285 movq 24(%rsp), %r11
1286 movq 32(%rsp), %r12
1287 movq 40(%rsp), %r13
1288 movq 48(%rsp), %r14
1289 movq 56(%rsp), %r15
1290
1291 call __rsaz_512_reduce
1292 ___
1293 $code.=<<___ if ($addx);
1294 jmp .Lmul_scatter_tail
1295
1296 .align 32
1297 .Lmulx_scatter:
1298 movq ($out), %rdx # pass b[0]
1299 call __rsaz_512_mulx
1300
1301 movq %xmm0, $out
1302 movq %xmm1, %rbp
1303
1304 movq 128(%rsp), %rdx # pull $n0
1305 movq (%rsp), %r8
1306 movq 8(%rsp), %r9
1307 movq 16(%rsp), %r10
1308 movq 24(%rsp), %r11
1309 movq 32(%rsp), %r12
1310 movq 40(%rsp), %r13
1311 movq 48(%rsp), %r14
1312 movq 56(%rsp), %r15
1313
1314 call __rsaz_512_reducex
1315
1316 .Lmul_scatter_tail:
1317 ___
1318 $code.=<<___;
1319 addq 64(%rsp), %r8
1320 adcq 72(%rsp), %r9
1321 adcq 80(%rsp), %r10
1322 adcq 88(%rsp), %r11
1323 adcq 96(%rsp), %r12
1324 adcq 104(%rsp), %r13
1325 adcq 112(%rsp), %r14
1326 adcq 120(%rsp), %r15
1327 movq %xmm2, $inp
1328 sbbq %rcx, %rcx
1329
1330 call __rsaz_512_subtract
1331
1332 movl %r8d, 64*0($inp) # scatter
1333 shrq \$32, %r8
1334 movl %r9d, 64*2($inp)
1335 shrq \$32, %r9
1336 movl %r10d, 64*4($inp)
1337 shrq \$32, %r10
1338 movl %r11d, 64*6($inp)
1339 shrq \$32, %r11
1340 movl %r12d, 64*8($inp)
1341 shrq \$32, %r12
1342 movl %r13d, 64*10($inp)
1343 shrq \$32, %r13
1344 movl %r14d, 64*12($inp)
1345 shrq \$32, %r14
1346 movl %r15d, 64*14($inp)
1347 shrq \$32, %r15
1348 movl %r8d, 64*1($inp)
1349 movl %r9d, 64*3($inp)
1350 movl %r10d, 64*5($inp)
1351 movl %r11d, 64*7($inp)
1352 movl %r12d, 64*9($inp)
1353 movl %r13d, 64*11($inp)
1354 movl %r14d, 64*13($inp)
1355 movl %r15d, 64*15($inp)
1356
1357 leaq 128+24+48(%rsp), %rax
1358 movq -48(%rax), %r15
1359 movq -40(%rax), %r14
1360 movq -32(%rax), %r13
1361 movq -24(%rax), %r12
1362 movq -16(%rax), %rbp
1363 movq -8(%rax), %rbx
1364 leaq (%rax), %rsp
1365 .Lmul_scatter4_epilogue:
1366 ret
1367 .size rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
1368 ___
1369 }
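# rsaz_512_mul_by_one performs a single Montgomery reduction of the 512-bit
# input with respect to $mod; in effect it returns inp*R^-1 mod m with
# R = 2^512, which is how a result is converted back out of the Montgomery
# domain at the end of an exponentiation.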
1370 {
1371 my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx");
1372 $code.=<<___;
1373 .globl rsaz_512_mul_by_one
1374 .type rsaz_512_mul_by_one,\@function,4
1375 .align 32
1376 rsaz_512_mul_by_one:
1377 push %rbx
1378 push %rbp
1379 push %r12
1380 push %r13
1381 push %r14
1382 push %r15
1383
1384 subq \$128+24, %rsp
1385 .Lmul_by_one_body:
1386 ___
1387 $code.=<<___ if ($addx);
1388 movl OPENSSL_ia32cap_P+8(%rip),%eax
1389 ___
1390 $code.=<<___;
1391 movq $mod, %rbp # reassign argument
1392 movq $n0, 128(%rsp)
1393
1394 movq ($inp), %r8
1395 pxor %xmm0, %xmm0
1396 movq 8($inp), %r9
1397 movq 16($inp), %r10
1398 movq 24($inp), %r11
1399 movq 32($inp), %r12
1400 movq 40($inp), %r13
1401 movq 48($inp), %r14
1402 movq 56($inp), %r15
1403
1404 movdqa %xmm0, (%rsp)
1405 movdqa %xmm0, 16(%rsp)
1406 movdqa %xmm0, 32(%rsp)
1407 movdqa %xmm0, 48(%rsp)
1408 movdqa %xmm0, 64(%rsp)
1409 movdqa %xmm0, 80(%rsp)
1410 movdqa %xmm0, 96(%rsp)
1411 ___
1412 $code.=<<___ if ($addx);
1413 andl \$0x80100,%eax
1414 cmpl \$0x80100,%eax # check for MULX and ADO/CX
1415 je .Lby_one_callx
1416 ___
1417 $code.=<<___;
1418 call __rsaz_512_reduce
1419 ___
1420 $code.=<<___ if ($addx);
1421 jmp .Lby_one_tail
1422 .align 32
1423 .Lby_one_callx:
1424 movq 128(%rsp), %rdx # pull $n0
1425 call __rsaz_512_reducex
1426 .Lby_one_tail:
1427 ___
1428 $code.=<<___;
1429 movq %r8, ($out)
1430 movq %r9, 8($out)
1431 movq %r10, 16($out)
1432 movq %r11, 24($out)
1433 movq %r12, 32($out)
1434 movq %r13, 40($out)
1435 movq %r14, 48($out)
1436 movq %r15, 56($out)
1437
1438 leaq 128+24+48(%rsp), %rax
1439 movq -48(%rax), %r15
1440 movq -40(%rax), %r14
1441 movq -32(%rax), %r13
1442 movq -24(%rax), %r12
1443 movq -16(%rax), %rbp
1444 movq -8(%rax), %rbx
1445 leaq (%rax), %rsp
1446 .Lmul_by_one_epilogue:
1447 ret
1448 .size rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
1449 ___
1450 }
1451 { # __rsaz_512_reduce
1452 #
1453 # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
1454 # output: %r8-%r15
1455 # clobbers: everything except %rbp and %rdi
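#
# This is word-by-word Montgomery reduction of a 16-limb value t by an
# 8-limb modulus; each of the 8 passes computes
#
#	m = (t[0] * n0) mod 2^64	# n0 = -mod^-1 mod 2^64
#	t = (t + m * mod) >> 64		# low limb is annihilated
#
# keeping the live window t[0..7] in %r8-%r15 and pre-computing the next m
# into %rsi.  The routine reads n0 from 128+8(%rsp) because the callers
# store it at 128(%rsp) in their own frame and "call" pushes an 8-byte
# return address on top of it.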
1456 $code.=<<___;
1457 .type __rsaz_512_reduce,\@abi-omnipotent
1458 .align 32
1459 __rsaz_512_reduce:
1460 movq %r8, %rbx
1461 imulq 128+8(%rsp), %rbx
1462 movq 0(%rbp), %rax
1463 movl \$8, %ecx
1464 jmp .Lreduction_loop
1465
1466 .align 32
1467 .Lreduction_loop:
1468 mulq %rbx
1469 movq 8(%rbp), %rax
1470 negq %r8
1471 movq %rdx, %r8
1472 adcq \$0, %r8
1473
1474 mulq %rbx
1475 addq %rax, %r9
1476 movq 16(%rbp), %rax
1477 adcq \$0, %rdx
1478 addq %r9, %r8
1479 movq %rdx, %r9
1480 adcq \$0, %r9
1481
1482 mulq %rbx
1483 addq %rax, %r10
1484 movq 24(%rbp), %rax
1485 adcq \$0, %rdx
1486 addq %r10, %r9
1487 movq %rdx, %r10
1488 adcq \$0, %r10
1489
1490 mulq %rbx
1491 addq %rax, %r11
1492 movq 32(%rbp), %rax
1493 adcq \$0, %rdx
1494 addq %r11, %r10
1495 movq 128+8(%rsp), %rsi
1496 #movq %rdx, %r11
1497 #adcq \$0, %r11
1498 adcq \$0, %rdx
1499 movq %rdx, %r11
1500
1501 mulq %rbx
1502 addq %rax, %r12
1503 movq 40(%rbp), %rax
1504 adcq \$0, %rdx
1505 imulq %r8, %rsi
1506 addq %r12, %r11
1507 movq %rdx, %r12
1508 adcq \$0, %r12
1509
1510 mulq %rbx
1511 addq %rax, %r13
1512 movq 48(%rbp), %rax
1513 adcq \$0, %rdx
1514 addq %r13, %r12
1515 movq %rdx, %r13
1516 adcq \$0, %r13
1517
1518 mulq %rbx
1519 addq %rax, %r14
1520 movq 56(%rbp), %rax
1521 adcq \$0, %rdx
1522 addq %r14, %r13
1523 movq %rdx, %r14
1524 adcq \$0, %r14
1525
1526 mulq %rbx
1527 movq %rsi, %rbx
1528 addq %rax, %r15
1529 movq 0(%rbp), %rax
1530 adcq \$0, %rdx
1531 addq %r15, %r14
1532 movq %rdx, %r15
1533 adcq \$0, %r15
1534
1535 decl %ecx
1536 jne .Lreduction_loop
1537
1538 ret
1539 .size __rsaz_512_reduce,.-__rsaz_512_reduce
1540 ___
1541 }
1542 if ($addx) {
1543 # __rsaz_512_reducex
1544 #
1545 # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
1546 # output: %r8-%r15
1547 # clobbers: everything except %rbp and %rdi
1548 $code.=<<___;
1549 .type __rsaz_512_reducex,\@abi-omnipotent
1550 .align 32
1551 __rsaz_512_reducex:
1552 #movq 128+8(%rsp), %rdx # pull $n0
1553 imulq %r8, %rdx
1554 xorq %rsi, %rsi # cf=0,of=0
1555 movl \$8, %ecx
1556 jmp .Lreduction_loopx
1557
1558 .align 32
1559 .Lreduction_loopx:
1560 mov %r8, %rbx
1561 mulx 0(%rbp), %rax, %r8
1562 adcx %rbx, %rax
1563 adox %r9, %r8
1564
1565 mulx 8(%rbp), %rax, %r9
1566 adcx %rax, %r8
1567 adox %r10, %r9
1568
1569 mulx 16(%rbp), %rbx, %r10
1570 adcx %rbx, %r9
1571 adox %r11, %r10
1572
1573 mulx 24(%rbp), %rbx, %r11
1574 adcx %rbx, %r10
1575 adox %r12, %r11
1576
1577 .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 32(%rbp), %rbx, %r12
1578 mov %rdx, %rax
1579 mov %r8, %rdx
1580 adcx %rbx, %r11
1581 adox %r13, %r12
1582
1583 mulx 128+8(%rsp), %rbx, %rdx
1584 mov %rax, %rdx
1585
1586 mulx 40(%rbp), %rax, %r13
1587 adcx %rax, %r12
1588 adox %r14, %r13
1589
1590 .byte 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00 # mulx 48(%rbp), %rax, %r14
1591 adcx %rax, %r13
1592 adox %r15, %r14
1593
1594 mulx 56(%rbp), %rax, %r15
1595 mov %rbx, %rdx
1596 adcx %rax, %r14
1597 adox %rsi, %r15 # %rsi is 0
1598 adcx %rsi, %r15 # cf=0
1599
1600 decl %ecx # of=0
1601 jne .Lreduction_loopx
1602
1603 ret
1604 .size __rsaz_512_reducex,.-__rsaz_512_reducex
1605 ___
1606 }
1607 { # __rsaz_512_subtract
1608 # input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask
1609 # output:
1610 # clobbers: everything but %rdi, %rsi and %rbp
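#
# This is a branch-free conditional subtraction: %rcx arrives as an all-ones
# or all-zero mask (the callers' "sbbq %rcx,%rcx" after folding in the upper
# half of the product).  neg/not build the two's complement of the modulus
# limb by limb (the modulus is odd, so the +1 never propagates past the
# least significant limb), the mask selects either that or zero, and the
# final addition therefore yields either t - mod or t unchanged without a
# data-dependent branch.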
1611 $code.=<<___;
1612 .type __rsaz_512_subtract,\@abi-omnipotent
1613 .align 32
1614 __rsaz_512_subtract:
1615 movq %r8, ($out)
1616 movq %r9, 8($out)
1617 movq %r10, 16($out)
1618 movq %r11, 24($out)
1619 movq %r12, 32($out)
1620 movq %r13, 40($out)
1621 movq %r14, 48($out)
1622 movq %r15, 56($out)
1623
1624 movq 0($mod), %r8
1625 movq 8($mod), %r9
1626 negq %r8
1627 notq %r9
1628 andq %rcx, %r8
1629 movq 16($mod), %r10
1630 andq %rcx, %r9
1631 notq %r10
1632 movq 24($mod), %r11
1633 andq %rcx, %r10
1634 notq %r11
1635 movq 32($mod), %r12
1636 andq %rcx, %r11
1637 notq %r12
1638 movq 40($mod), %r13
1639 andq %rcx, %r12
1640 notq %r13
1641 movq 48($mod), %r14
1642 andq %rcx, %r13
1643 notq %r14
1644 movq 56($mod), %r15
1645 andq %rcx, %r14
1646 notq %r15
1647 andq %rcx, %r15
1648
1649 addq ($out), %r8
1650 adcq 8($out), %r9
1651 adcq 16($out), %r10
1652 adcq 24($out), %r11
1653 adcq 32($out), %r12
1654 adcq 40($out), %r13
1655 adcq 48($out), %r14
1656 adcq 56($out), %r15
1657
1658 movq %r8, ($out)
1659 movq %r9, 8($out)
1660 movq %r10, 16($out)
1661 movq %r11, 24($out)
1662 movq %r12, 32($out)
1663 movq %r13, 40($out)
1664 movq %r14, 48($out)
1665 movq %r15, 56($out)
1666
1667 ret
1668 .size __rsaz_512_subtract,.-__rsaz_512_subtract
1669 ___
1670 }
1671 { # __rsaz_512_mul
1672 #
1673 # input: %rsi - ap, %rbp - bp
1674 # output:
1675 # clobbers: everything
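#
# Classic operand-scanning multiplication: the caller preloads b[0] in %rbx;
# the first pass multiplies it by all eight limbs of a, emits the lowest
# product limb and keeps the next eight limbs of the running sum in
# %r8-%r15.  Each of the remaining seven passes multiplies by the next b[i],
# folds the partial products into that window and retires one more limb,
# and the final stores flush the window, leaving the 1024-bit product in
# the 128-byte scratch area at the bottom of the caller's frame.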
1676 my ($ap,$bp) = ("%rsi","%rbp");
1677 $code.=<<___;
1678 .type __rsaz_512_mul,\@abi-omnipotent
1679 .align 32
1680 __rsaz_512_mul:
1681 leaq 8(%rsp), %rdi
1682
1683 movq ($ap), %rax
1684 mulq %rbx
1685 movq %rax, (%rdi)
1686 movq 8($ap), %rax
1687 movq %rdx, %r8
1688
1689 mulq %rbx
1690 addq %rax, %r8
1691 movq 16($ap), %rax
1692 movq %rdx, %r9
1693 adcq \$0, %r9
1694
1695 mulq %rbx
1696 addq %rax, %r9
1697 movq 24($ap), %rax
1698 movq %rdx, %r10
1699 adcq \$0, %r10
1700
1701 mulq %rbx
1702 addq %rax, %r10
1703 movq 32($ap), %rax
1704 movq %rdx, %r11
1705 adcq \$0, %r11
1706
1707 mulq %rbx
1708 addq %rax, %r11
1709 movq 40($ap), %rax
1710 movq %rdx, %r12
1711 adcq \$0, %r12
1712
1713 mulq %rbx
1714 addq %rax, %r12
1715 movq 48($ap), %rax
1716 movq %rdx, %r13
1717 adcq \$0, %r13
1718
1719 mulq %rbx
1720 addq %rax, %r13
1721 movq 56($ap), %rax
1722 movq %rdx, %r14
1723 adcq \$0, %r14
1724
1725 mulq %rbx
1726 addq %rax, %r14
1727 movq ($ap), %rax
1728 movq %rdx, %r15
1729 adcq \$0, %r15
1730
1731 leaq 8($bp), $bp
1732 leaq 8(%rdi), %rdi
1733
1734 movl \$7, %ecx
1735 jmp .Loop_mul
1736
1737 .align 32
1738 .Loop_mul:
1739 movq ($bp), %rbx
1740 mulq %rbx
1741 addq %rax, %r8
1742 movq 8($ap), %rax
1743 movq %r8, (%rdi)
1744 movq %rdx, %r8
1745 adcq \$0, %r8
1746
1747 mulq %rbx
1748 addq %rax, %r9
1749 movq 16($ap), %rax
1750 adcq \$0, %rdx
1751 addq %r9, %r8
1752 movq %rdx, %r9
1753 adcq \$0, %r9
1754
1755 mulq %rbx
1756 addq %rax, %r10
1757 movq 24($ap), %rax
1758 adcq \$0, %rdx
1759 addq %r10, %r9
1760 movq %rdx, %r10
1761 adcq \$0, %r10
1762
1763 mulq %rbx
1764 addq %rax, %r11
1765 movq 32($ap), %rax
1766 adcq \$0, %rdx
1767 addq %r11, %r10
1768 movq %rdx, %r11
1769 adcq \$0, %r11
1770
1771 mulq %rbx
1772 addq %rax, %r12
1773 movq 40($ap), %rax
1774 adcq \$0, %rdx
1775 addq %r12, %r11
1776 movq %rdx, %r12
1777 adcq \$0, %r12
1778
1779 mulq %rbx
1780 addq %rax, %r13
1781 movq 48($ap), %rax
1782 adcq \$0, %rdx
1783 addq %r13, %r12
1784 movq %rdx, %r13
1785 adcq \$0, %r13
1786
1787 mulq %rbx
1788 addq %rax, %r14
1789 movq 56($ap), %rax
1790 adcq \$0, %rdx
1791 addq %r14, %r13
1792 movq %rdx, %r14
1793 leaq 8($bp), $bp
1794 adcq \$0, %r14
1795
1796 mulq %rbx
1797 addq %rax, %r15
1798 movq ($ap), %rax
1799 adcq \$0, %rdx
1800 addq %r15, %r14
1801 movq %rdx, %r15
1802 adcq \$0, %r15
1803
1804 leaq 8(%rdi), %rdi
1805
1806 decl %ecx
1807 jnz .Loop_mul
1808
1809 movq %r8, (%rdi)
1810 movq %r9, 8(%rdi)
1811 movq %r10, 16(%rdi)
1812 movq %r11, 24(%rdi)
1813 movq %r12, 32(%rdi)
1814 movq %r13, 40(%rdi)
1815 movq %r14, 48(%rdi)
1816 movq %r15, 56(%rdi)
1817
1818 ret
1819 .size __rsaz_512_mul,.-__rsaz_512_mul
1820 ___
1821 }
1822 if ($addx) {
1823 # __rsaz_512_mulx
1824 #
1825 # input: %rsi - ap, %rbp - bp
1826 # output:
1827 # clobbers: everything
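#
# The MULX path relies on the ADCX/ADOX idiom: mulx forms a 128-bit product
# without touching the flags, while adcx and adox carry through CF and OF
# independently.  Each pass therefore runs two interleaved addition chains
# at once -- one folding the low halves of the fresh partial products into
# the accumulator window, the other folding in the previous window limbs --
# instead of the single serialized adcq chain of the code above.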
1828 my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi");
1829 $code.=<<___;
1830 .type __rsaz_512_mulx,\@abi-omnipotent
1831 .align 32
1832 __rsaz_512_mulx:
1833 mulx ($ap), %rbx, %r8 # initial %rdx preloaded by caller
1834 mov \$-6, %rcx
1835
1836 mulx 8($ap), %rax, %r9
1837 movq %rbx, 8(%rsp)
1838
1839 mulx 16($ap), %rbx, %r10
1840 adc %rax, %r8
1841
1842 mulx 24($ap), %rax, %r11
1843 adc %rbx, %r9
1844
1845 mulx 32($ap), %rbx, %r12
1846 adc %rax, %r10
1847
1848 mulx 40($ap), %rax, %r13
1849 adc %rbx, %r11
1850
1851 mulx 48($ap), %rbx, %r14
1852 adc %rax, %r12
1853
1854 mulx 56($ap), %rax, %r15
1855 mov 8($bp), %rdx
1856 adc %rbx, %r13
1857 adc %rax, %r14
1858 adc \$0, %r15
1859
1860 xor $zero, $zero # cf=0,of=0
1861 jmp .Loop_mulx
1862
1863 .align 32
1864 .Loop_mulx:
1865 movq %r8, %rbx
1866 mulx ($ap), %rax, %r8
1867 adcx %rax, %rbx
1868 adox %r9, %r8
1869
1870 mulx 8($ap), %rax, %r9
1871 adcx %rax, %r8
1872 adox %r10, %r9
1873
1874 mulx 16($ap), %rax, %r10
1875 adcx %rax, %r9
1876 adox %r11, %r10
1877
1878 mulx 24($ap), %rax, %r11
1879 adcx %rax, %r10
1880 adox %r12, %r11
1881
1882 .byte 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00 # mulx 32($ap), %rax, %r12
1883 adcx %rax, %r11
1884 adox %r13, %r12
1885
1886 mulx 40($ap), %rax, %r13
1887 adcx %rax, %r12
1888 adox %r14, %r13
1889
1890 mulx 48($ap), %rax, %r14
1891 adcx %rax, %r13
1892 adox %r15, %r14
1893
1894 mulx 56($ap), %rax, %r15
1895 movq 64($bp,%rcx,8), %rdx
1896 movq %rbx, 8+64-8(%rsp,%rcx,8)
1897 adcx %rax, %r14
1898 adox $zero, %r15
1899 adcx $zero, %r15 # cf=0
1900
1901 inc %rcx # of=0
1902 jnz .Loop_mulx
1903
1904 movq %r8, %rbx
1905 mulx ($ap), %rax, %r8
1906 adcx %rax, %rbx
1907 adox %r9, %r8
1908
1909 .byte 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00 # mulx 8($ap), %rax, %r9
1910 adcx %rax, %r8
1911 adox %r10, %r9
1912
1913 .byte 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00 # mulx 16($ap), %rax, %r10
1914 adcx %rax, %r9
1915 adox %r11, %r10
1916
1917 mulx 24($ap), %rax, %r11
1918 adcx %rax, %r10
1919 adox %r12, %r11
1920
1921 mulx 32($ap), %rax, %r12
1922 adcx %rax, %r11
1923 adox %r13, %r12
1924
1925 mulx 40($ap), %rax, %r13
1926 adcx %rax, %r12
1927 adox %r14, %r13
1928
1929 .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
1930 adcx %rax, %r13
1931 adox %r15, %r14
1932
1933 .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($ap), %rax, %r15
1934 adcx %rax, %r14
1935 adox $zero, %r15
1936 adcx $zero, %r15
1937
1938 mov %rbx, 8+64-8(%rsp)
1939 mov %r8, 8+64(%rsp)
1940 mov %r9, 8+64+8(%rsp)
1941 mov %r10, 8+64+16(%rsp)
1942 mov %r11, 8+64+24(%rsp)
1943 mov %r12, 8+64+32(%rsp)
1944 mov %r13, 8+64+40(%rsp)
1945 mov %r14, 8+64+48(%rsp)
1946 mov %r15, 8+64+56(%rsp)
1947
1948 ret
1949 .size __rsaz_512_mulx,.-__rsaz_512_mulx
1950 ___
1951 }
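# rsaz_512_scatter4 stores an 8-limb value into the power-table column
# selected by $power: each 64-bit limb is split into two 32-bit words placed
# 64 bytes apart, consecutive limbs are 128 bytes apart, so entries for
# different powers lie only 4 bytes from one another; rsaz_512_gather4
# reverses the transformation.  The interleaved layout is presumably the
# side-channel measure of reference [4], meant to keep the gather's
# cache-line footprint (largely) independent of the secret index $power.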
1952 {
1953 my ($out,$inp,$power)= $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
1954 $code.=<<___;
1955 .globl rsaz_512_scatter4
1956 .type rsaz_512_scatter4,\@abi-omnipotent
1957 .align 16
1958 rsaz_512_scatter4:
1959 leaq ($out,$power,4), $out
1960 movl \$8, %r9d
1961 jmp .Loop_scatter
1962 .align 16
1963 .Loop_scatter:
1964 movq ($inp), %rax
1965 leaq 8($inp), $inp
1966 movl %eax, ($out)
1967 shrq \$32, %rax
1968 movl %eax, 64($out)
1969 leaq 128($out), $out
1970 decl %r9d
1971 jnz .Loop_scatter
1972 ret
1973 .size rsaz_512_scatter4,.-rsaz_512_scatter4
1974
1975 .globl rsaz_512_gather4
1976 .type rsaz_512_gather4,\@abi-omnipotent
1977 .align 16
1978 rsaz_512_gather4:
1979 leaq ($inp,$power,4), $inp
1980 movl \$8, %r9d
1981 jmp .Loop_gather
1982 .align 16
1983 .Loop_gather:
1984 movl ($inp), %eax
1985 movl 64($inp), %r8d
1986 leaq 128($inp), $inp
1987 shlq \$32, %r8
1988 or %r8, %rax
1989 movq %rax, ($out)
1990 leaq 8($out), $out
1991 decl %r9d
1992 jnz .Loop_gather
1993 ret
1994 .size rsaz_512_gather4,.-rsaz_512_gather4
1995 ___
1996 }
1997
1998 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1999 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
2000 if ($win64) {
2001 $rec="%rcx";
2002 $frame="%rdx";
2003 $context="%r8";
2004 $disp="%r9";
2005
2006 $code.=<<___;
2007 .extern __imp_RtlVirtualUnwind
2008 .type se_handler,\@abi-omnipotent
2009 .align 16
2010 se_handler:
2011 push %rsi
2012 push %rdi
2013 push %rbx
2014 push %rbp
2015 push %r12
2016 push %r13
2017 push %r14
2018 push %r15
2019 pushfq
2020 sub \$64,%rsp
2021
2022 mov 120($context),%rax # pull context->Rax
2023 mov 248($context),%rbx # pull context->Rip
2024
2025 mov 8($disp),%rsi # disp->ImageBase
2026 mov 56($disp),%r11 # disp->HandlerData
2027
2028 mov 0(%r11),%r10d # HandlerData[0]
2029 lea (%rsi,%r10),%r10 # end of prologue label
2030 cmp %r10,%rbx # context->Rip<end of prologue label
2031 jb .Lcommon_seh_tail
2032
2033 mov 152($context),%rax # pull context->Rsp
2034
2035 mov 4(%r11),%r10d # HandlerData[1]
2036 lea (%rsi,%r10),%r10 # epilogue label
2037 cmp %r10,%rbx # context->Rip>=epilogue label
2038 jae .Lcommon_seh_tail
2039
2040 lea 128+24+48(%rax),%rax
2041
2042 mov -8(%rax),%rbx
2043 mov -16(%rax),%rbp
2044 mov -24(%rax),%r12
2045 mov -32(%rax),%r13
2046 mov -40(%rax),%r14
2047 mov -48(%rax),%r15
2048 mov %rbx,144($context) # restore context->Rbx
2049 mov %rbp,160($context) # restore context->Rbp
2050 mov %r12,216($context) # restore context->R12
2051 mov %r13,224($context) # restore context->R13
2052 mov %r14,232($context) # restore context->R14
2053 mov %r15,240($context) # restore context->R15
2054
2055 .Lcommon_seh_tail:
2056 mov 8(%rax),%rdi
2057 mov 16(%rax),%rsi
2058 mov %rax,152($context) # restore context->Rsp
2059 mov %rsi,168($context) # restore context->Rsi
2060 mov %rdi,176($context) # restore context->Rdi
2061
2062 mov 40($disp),%rdi # disp->ContextRecord
2063 mov $context,%rsi # context
2064 	mov	\$154,%ecx		# sizeof(CONTEXT) in quadwords
2065 .long 0xa548f3fc # cld; rep movsq
2066
2067 mov $disp,%rsi
2068 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
2069 mov 8(%rsi),%rdx # arg2, disp->ImageBase
2070 mov 0(%rsi),%r8 # arg3, disp->ControlPc
2071 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
2072 mov 40(%rsi),%r10 # disp->ContextRecord
2073 lea 56(%rsi),%r11 # &disp->HandlerData
2074 lea 24(%rsi),%r12 # &disp->EstablisherFrame
2075 mov %r10,32(%rsp) # arg5
2076 mov %r11,40(%rsp) # arg6
2077 mov %r12,48(%rsp) # arg7
2078 mov %rcx,56(%rsp) # arg8, (NULL)
2079 call *__imp_RtlVirtualUnwind(%rip)
2080
2081 mov \$1,%eax # ExceptionContinueSearch
2082 add \$64,%rsp
2083 popfq
2084 pop %r15
2085 pop %r14
2086 pop %r13
2087 pop %r12
2088 pop %rbp
2089 pop %rbx
2090 pop %rdi
2091 pop %rsi
2092 ret
2093 .size	se_handler,.-se_handler
2094
2095 .section .pdata
2096 .align 4
2097 .rva .LSEH_begin_rsaz_512_sqr
2098 .rva .LSEH_end_rsaz_512_sqr
2099 .rva .LSEH_info_rsaz_512_sqr
2100
2101 .rva .LSEH_begin_rsaz_512_mul
2102 .rva .LSEH_end_rsaz_512_mul
2103 .rva .LSEH_info_rsaz_512_mul
2104
2105 .rva .LSEH_begin_rsaz_512_mul_gather4
2106 .rva .LSEH_end_rsaz_512_mul_gather4
2107 .rva .LSEH_info_rsaz_512_mul_gather4
2108
2109 .rva .LSEH_begin_rsaz_512_mul_scatter4
2110 .rva .LSEH_end_rsaz_512_mul_scatter4
2111 .rva .LSEH_info_rsaz_512_mul_scatter4
2112
2113 .rva .LSEH_begin_rsaz_512_mul_by_one
2114 .rva .LSEH_end_rsaz_512_mul_by_one
2115 .rva .LSEH_info_rsaz_512_mul_by_one
2116
2117 .section .xdata
2118 .align 8
2119 .LSEH_info_rsaz_512_sqr:
2120 .byte 9,0,0,0
2121 .rva se_handler
2122 .rva .Lsqr_body,.Lsqr_epilogue # HandlerData[]
2123 .LSEH_info_rsaz_512_mul:
2124 .byte 9,0,0,0
2125 .rva se_handler
2126 .rva .Lmul_body,.Lmul_epilogue # HandlerData[]
2127 .LSEH_info_rsaz_512_mul_gather4:
2128 .byte 9,0,0,0
2129 .rva se_handler
2130 .rva .Lmul_gather4_body,.Lmul_gather4_epilogue # HandlerData[]
2131 .LSEH_info_rsaz_512_mul_scatter4:
2132 .byte 9,0,0,0
2133 .rva se_handler
2134 .rva .Lmul_scatter4_body,.Lmul_scatter4_epilogue # HandlerData[]
2135 .LSEH_info_rsaz_512_mul_by_one:
2136 .byte 9,0,0,0
2137 .rva se_handler
2138 .rva .Lmul_by_one_body,.Lmul_by_one_epilogue # HandlerData[]
2139 ___
2140 }
2141
2142 $code =~ s/\`([^\`]*)\`/eval $1/gem;
2143 print $code;
2144 close STDOUT;