3 #******************************************************************************#
4 #* Copyright(c) 2012, Intel Corp. *#
5 #* Developers and authors: *#
6 #* Shay Gueron (1, 2), and Vlad Krasnov (1) *#
7 #* (1) Intel Architecture Group, Microprocessor and Chipset Development, *#
8 #* Israel Development Center, Haifa, Israel *#
9 #* (2) University of Haifa *#
10 #******************************************************************************#
11 #* This submission to OpenSSL is to be made available under the OpenSSL *#
12 #* license, and only to the OpenSSL project, in order to allow integration *#
13 #* into the publicly distributed code.                                       *#
14 #* The use of this code, or portions of this code, or concepts embedded in *#
15 #* this code, or modification of this code and/or algorithm(s) in it, or the *#
16 #* use of this code for any other purpose than stated above, requires special *#
17 #* license.                                                                   *#
18 #******************************************************************************#
19 #******************************************************************************#
21 #* THIS SOFTWARE IS PROVIDED BY THE CONTRIBUTORS AND THE COPYRIGHT OWNERS *#
22 #* ``AS IS''. ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED *#
23 #* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR *#
24 #* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS OR THE COPYRIGHT*#
25 #* OWNERS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, *#
26 #* OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF *#
27 #* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS *#
28 #* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN *#
29 #* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) *#
30 #* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE *#
31 #* POSSIBILITY OF SUCH DAMAGE. *#
32 #******************************************************************************#
34 #* [1] S. Gueron, "Efficient Software Implementations of Modular *#
35 #* Exponentiation", http://eprint.iacr.org/2011/239 *#
36 #* [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring". *#
37 #* IEEE Proceedings of 9th International Conference on Information *#
38 #* Technology: New Generations (ITNG 2012), 821-823 (2012). *#
39 #* [3] S. Gueron, Efficient Software Implementations of Modular Exponentiation*#
40 #* Journal of Cryptographic Engineering 2:31-43 (2012). *#
41 #* [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis *#
42 #* resistant 512-bit and 1024-bit modular exponentiation for optimizing *#
43 #* RSA1024 and RSA2048 on x86_64 platforms", *#
44 #* http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest*#
45 ################################################################################
47 # While original submission covers 512- and 1024-bit exponentiation,
48 # this module is limited to 512-bit version only (and as such
49 # accelerates RSA1024 sign). This is because improvement for longer
50 # keys is not high enough to justify the effort, highest measured
51 # was ~5% on Westmere. [This is relative to OpenSSL 1.0.2, upcoming
52 # at the time of this writing!] Nor does this module implement
53 # "monolithic" complete exponentiation jumbo-subroutine, but adheres
54 # to more modular mixture of C and assembly. And it's optimized even
55 # for processors other than Intel Core family (see table below for
56 # improvement coefficients).
59 # RSA1024 sign/sec this/original |this/rsax(*) this/fips(*)
60 # ----------------+---------------------------
61 # Opteron +13% |+5% +20%
62 # Bulldozer -0% |-1% +10%
64 # Westmere +5% |+14% +17%
65 # Sandy Bridge +2% |+12% +29%
66 # Ivy Bridge +1% |+11% +35%
67 # Haswell(**) -0% |+12% +39%
69 # VIA Nano +70% |+9% +25%
71 # (*) rsax engine and fips numbers are presented for reference
73 # (**) MULX was attempted, but found to give only marginal improvement;
77 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
79 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
81 $0 =~ m/(.*[\/\\])[^\
/\\]+$/; $dir=$1;
82 ( $xlate="${dir}x86_64-xlate.pl" and -f
$xlate ) or
83 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f
$xlate) or
84 die "can't locate x86_64-xlate.pl";
86 open OUT
,"| $^X $xlate $flavour $output";
89 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
90 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
94 if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM
} =~ /nasm/) &&
95 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
99 if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM
} =~ /ml64/) &&
100 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
104 ($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp"); # common internal API
106 my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d");
111 .extern OPENSSL_ia32cap_P
114 .type rsaz_512_sqr
,\
@function,4
116 rsaz_512_sqr
: # 25-29% faster than rsaz_512_mul
126 movq
$mod, %rbp # common argument
131 $code.=<<___
if ($addx);
133 andl OPENSSL_ia32cap_P
+8(%rip),%r11d
134 cmpl \
$0x80100,%r11d # check for MULX and ADO/CX
142 movl
$times,128+8(%rsp)
186 addq
%r8, %r8 #shlq \$1, %r8
188 adcq
%r9, %r9 #shld \$1, %r8, %r9
249 lea
(%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
251 adcq
%r11, %r11 #shld \$1, %r10, %r11
289 lea
(%rbx,%r12,2), %r12 #shld \$1, %rbx, %r12
307 leaq
(%r10,%r13,2), %r13 #shld \$1, %r12, %r13
337 leaq
(%rcx,%r14,2), %r14 #shld \$1, %rcx, %r14
355 leaq
(%r12,%r15,2),%r15 #shld \$1, %r14, %r15
380 leaq
(%rbx,%r8,2), %r8 #shld \$1, %rbx, %r8
395 leaq
(%r12,%r9,2), %r9 #shld \$1, %r8, %r9
419 leaq
(%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
427 leaq
(%r15,%r11,2), %r11 #shld \$1, %r10, %r11
448 adcq
%r12, %r12 #shld \$1, %rbx, %r12
449 adcq
%r13, %r13 #shld \$1, %r12, %r13
450 adcq
%r14, %r14 #shld \$1, %r13, %r14
480 call __rsaz_512_reduce
492 call __rsaz_512_subtract
496 movl
128+8(%rsp), $times
508 movl
$times,128+8(%rsp)
509 movq
$out, %xmm0 # off-load
510 movq
%rbp, %xmm1 # off-load
514 mulx
16($inp), %rcx, %r10
515 xor %rbp, %rbp # cf=0, of=0
517 mulx
24($inp), %rax, %r11
520 mulx
32($inp), %rcx, %r12
523 mulx
40($inp), %rax, %r13
526 .byte
0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($inp), %rcx, %r14
530 .byte
0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r15
532 adcx
%rbp, %r15 # %rbp is 0
539 mulx
%rdx, %rax, %rdx
548 mulx
16($inp), %rax, %rbx
552 .byte
0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r8
556 mulx
32($inp), %rax, %rbx
560 mulx
40($inp), $out, %r8
564 .byte
0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
568 .byte
0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r8
578 mulx
%rdx, %rax, %rcx
585 .byte
0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 # mov %r10, 24(%rsp)
588 .byte
0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r9
592 mulx
32($inp), %rax, %rcx
596 mulx
40($inp), $out, %r9
600 .byte
0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rcx
604 .byte
0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r9
614 mulx
%rdx, %rax, %rdx
621 .byte
0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00 # mov %r12, 40(%rsp)
624 .byte
0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00 # mulx 32($inp), %rax, %rbx
628 mulx
40($inp), $out, %r10
632 mulx
48($inp), %rax, %rbx
636 mulx
56($inp), $out, %r10
647 mulx
%rdx, %rax, %rdx
657 .byte
0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00 # mulx 40($inp), $out, %r11
661 mulx
48($inp), %rax, %rcx
665 mulx
56($inp), $out, %r11
675 mulx
%rdx, %rax, %rdx
685 .byte
0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
689 .byte
0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r12
699 mulx
%rdx, %rax, %rdx
709 .byte
0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r13
719 mulx
%rdx, %rax, %rdx
725 .byte
0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 # mov %r11, 96(%rsp)
726 .byte
0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 # mov %r12, 104(%rsp)
729 mulx
%rdx, %rax, %rdx
741 movq
128(%rsp), %rdx # pull $n0
751 call __rsaz_512_reducex
763 call __rsaz_512_subtract
767 movl
128+8(%rsp), $times
778 leaq
128+24+48(%rsp), %rax
788 .size rsaz_512_sqr
,.-rsaz_512_sqr
792 my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
795 .type rsaz_512_mul
,\
@function,5
807 movq
$out, %xmm0 # off-load arguments
811 $code.=<<___
if ($addx);
813 andl OPENSSL_ia32cap_P
+8(%rip),%r11d
814 cmpl \
$0x80100,%r11d # check for MULX and ADO/CX
818 movq
($bp), %rbx # pass b[0]
819 movq
$bp, %rbp # pass argument
834 call __rsaz_512_reduce
836 $code.=<<___
if ($addx);
841 movq
$bp, %rbp # pass argument
842 movq
($bp), %rdx # pass b[0]
848 movq
128(%rsp), %rdx # pull $n0
858 call __rsaz_512_reducex
872 call __rsaz_512_subtract
874 leaq
128+24+48(%rsp), %rax
884 .size rsaz_512_mul
,.-rsaz_512_mul
888 my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
890 .globl rsaz_512_mul_gather4
891 .type rsaz_512_mul_gather4
,\
@function,6
893 rsaz_512_mul_gather4
:
904 $code.=<<___
if ($addx);
906 andl OPENSSL_ia32cap_P
+8(%rip),%r11d
907 cmpl \
$0x80100,%r11d # check for MULX and ADO/CX
911 movl
64($bp,$pwr,4), %eax
912 movq
$out, %xmm0 # off-load arguments
913 movl
($bp,$pwr,4), %ebx
921 leaq
128($bp,$pwr,4), %rbp
922 mulq
%rbx # 0 iteration
1049 leaq
128(%rbp), %rbp
1053 jnz
.Loop_mul_gather
1076 call __rsaz_512_reduce
1078 $code.=<<___
if ($addx);
1079 jmp
.Lmul_gather_tail
1083 mov
64($bp,$pwr,4), %eax
1084 movq
$out, %xmm0 # off-load arguments
1085 lea
128($bp,$pwr,4), %rbp
1086 mov
($bp,$pwr,4), %edx
1092 mulx
($ap), %rbx, %r8 # 0 iteration
1094 xor %edi, %edi # cf=0, of=0
1096 mulx
8($ap), %rax, %r9
1099 mulx
16($ap), %rbx, %r10
1100 movd
64(%rbp), %xmm5
1103 mulx
24($ap), %rax, %r11
1107 mulx
32($ap), %rbx, %r12
1111 mulx
40($ap), %rax, %r13
1114 mulx
48($ap), %rbx, %r14
1118 mulx
56($ap), %rax, %r15
1123 adcx
%rdi, %r15 # %rdi is 0
1126 jmp
.Loop_mulx_gather
1130 mulx
($ap), %rax, %r8
1134 mulx
8($ap), %rax, %r9
1135 .byte
0x66,0x0f,0x6e,0xa5,0x00,0x00,0x00,0x00 # movd (%rbp), %xmm4
1139 mulx
16($ap), %rax, %r10
1140 movd
64(%rbp), %xmm5
1145 .byte
0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 # mulx 24($ap), %rax, %r11
1151 mulx
32($ap), %rax, %r12
1155 mulx
40($ap), %rax, %r13
1159 .byte
0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
1163 mulx
56($ap), %rax, %r15
1165 mov
%rbx, 64(%rsp,%rcx,8)
1169 adcx
%rdi, %r15 # cf=0
1172 jnz
.Loop_mulx_gather
1176 mov
%r10, 64+16(%rsp)
1177 mov
%r11, 64+24(%rsp)
1178 mov
%r12, 64+32(%rsp)
1179 mov
%r13, 64+40(%rsp)
1180 mov
%r14, 64+48(%rsp)
1181 mov
%r15, 64+56(%rsp)
1186 mov
128(%rsp), %rdx # pull $n0
1196 call __rsaz_512_reducex
1206 adcq
104(%rsp), %r13
1207 adcq
112(%rsp), %r14
1208 adcq
120(%rsp), %r15
1211 call __rsaz_512_subtract
1213 leaq
128+24+48(%rsp), %rax
1214 movq
-48(%rax), %r15
1215 movq
-40(%rax), %r14
1216 movq
-32(%rax), %r13
1217 movq
-24(%rax), %r12
1218 movq
-16(%rax), %rbp
1221 .Lmul_gather4_epilogue
:
1223 .size rsaz_512_mul_gather4
,.-rsaz_512_mul_gather4
1227 my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1229 .globl rsaz_512_mul_scatter4
1230 .type rsaz_512_mul_scatter4
,\
@function,6
1232 rsaz_512_mul_scatter4
:
1241 .Lmul_scatter4_body
:
1242 leaq
($tbl,$pwr,4), $tbl
1243 movq
$out, %xmm0 # off-load arguments
1250 $code.=<<___
if ($addx);
1251 movl \
$0x80100,%r11d
1252 andl OPENSSL_ia32cap_P
+8(%rip),%r11d
1253 cmpl \
$0x80100,%r11d # check for MULX and ADO/CX
1257 movq
($out),%rbx # pass b[0]
1272 call __rsaz_512_reduce
1274 $code.=<<___
if ($addx);
1275 jmp
.Lmul_scatter_tail
1279 movq
($out), %rdx # pass b[0]
1280 call __rsaz_512_mulx
1285 movq
128(%rsp), %rdx # pull $n0
1295 call __rsaz_512_reducex
1305 adcq
104(%rsp), %r13
1306 adcq
112(%rsp), %r14
1307 adcq
120(%rsp), %r15
1311 call __rsaz_512_subtract
1313 movl
%r8d, 64*0($inp) # scatter
1315 movl
%r9d, 64*2($inp)
1317 movl
%r10d, 64*4($inp)
1319 movl
%r11d, 64*6($inp)
1321 movl
%r12d, 64*8($inp)
1323 movl
%r13d, 64*10($inp)
1325 movl
%r14d, 64*12($inp)
1327 movl
%r15d, 64*14($inp)
1329 movl
%r8d, 64*1($inp)
1330 movl
%r9d, 64*3($inp)
1331 movl
%r10d, 64*5($inp)
1332 movl
%r11d, 64*7($inp)
1333 movl
%r12d, 64*9($inp)
1334 movl
%r13d, 64*11($inp)
1335 movl
%r14d, 64*13($inp)
1336 movl
%r15d, 64*15($inp)
1338 leaq
128+24+48(%rsp), %rax
1339 movq
-48(%rax), %r15
1340 movq
-40(%rax), %r14
1341 movq
-32(%rax), %r13
1342 movq
-24(%rax), %r12
1343 movq
-16(%rax), %rbp
1346 .Lmul_scatter4_epilogue
:
1348 .size rsaz_512_mul_scatter4
,.-rsaz_512_mul_scatter4
1352 my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx");
1354 .globl rsaz_512_mul_by_one
1355 .type rsaz_512_mul_by_one
,\
@function,4
1357 rsaz_512_mul_by_one
:
1368 $code.=<<___
if ($addx);
1369 movl OPENSSL_ia32cap_P
+8(%rip),%eax
1372 movq
$mod, %rbp # reassign argument
1385 movdqa
%xmm0, (%rsp)
1386 movdqa
%xmm0, 16(%rsp)
1387 movdqa
%xmm0, 32(%rsp)
1388 movdqa
%xmm0, 48(%rsp)
1389 movdqa
%xmm0, 64(%rsp)
1390 movdqa
%xmm0, 80(%rsp)
1391 movdqa
%xmm0, 96(%rsp)
1393 $code.=<<___
if ($addx);
1395 cmpl \
$0x80100,%eax # check for MULX and ADO/CX
1399 call __rsaz_512_reduce
1401 $code.=<<___
if ($addx);
1405 movq
128(%rsp), %rdx # pull $n0
1406 call __rsaz_512_reducex
1419 leaq
128+24+48(%rsp), %rax
1420 movq
-48(%rax), %r15
1421 movq
-40(%rax), %r14
1422 movq
-32(%rax), %r13
1423 movq
-24(%rax), %r12
1424 movq
-16(%rax), %rbp
1427 .Lmul_by_one_epilogue
:
1429 .size rsaz_512_mul_by_one
,.-rsaz_512_mul_by_one
1432 { # __rsaz_512_reduce
1434 # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
1436 # clobbers: everything except %rbp and %rdi
1438 .type __rsaz_512_reduce
,\
@abi-omnipotent
1442 imulq
128+8(%rsp), %rbx
1445 jmp
.Lreduction_loop
1476 movq
128+8(%rsp), %rsi
1517 jne
.Lreduction_loop
1520 .size __rsaz_512_reduce
,.-__rsaz_512_reduce
1524 # __rsaz_512_reducex
1526 # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
1528 # clobbers: everything except %rbp and %rdi
1530 .type __rsaz_512_reducex
,\
@abi-omnipotent
1533 #movq 128+8(%rsp), %rdx # pull $n0
1535 xorq
%rsi, %rsi # cf=0,of=0
1537 jmp
.Lreduction_loopx
1542 mulx
0(%rbp), %rax, %r8
1546 mulx
8(%rbp), %rax, %r9
1550 mulx
16(%rbp), %rbx, %r10
1554 mulx
24(%rbp), %rbx, %r11
1558 .byte
0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 32(%rbp), %rbx, %r12
1564 mulx
128+8(%rsp), %rbx, %rdx
1567 mulx
40(%rbp), %rax, %r13
1571 .byte
0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00 # mulx 48(%rbp), %rax, %r14
1575 mulx
56(%rbp), %rax, %r15
1578 adox
%rsi, %r15 # %rsi is 0
1579 adcx
%rsi, %r15 # cf=0
1582 jne
.Lreduction_loopx
1585 .size __rsaz_512_reducex
,.-__rsaz_512_reducex
1588 { # __rsaz_512_subtract
1589 # input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask
1591 # clobbers: everything but %rdi, %rsi and %rbp
1593 .type __rsaz_512_subtract
,\
@abi-omnipotent
1595 __rsaz_512_subtract
:
1649 .size __rsaz_512_subtract
,.-__rsaz_512_subtract
1654 # input: %rsi - ap, %rbp - bp
1656 # clobbers: everything
1657 my ($ap,$bp) = ("%rsi","%rbp");
1659 .type __rsaz_512_mul
,\
@abi-omnipotent
1800 .size __rsaz_512_mul
,.-__rsaz_512_mul
1806 # input: %rsi - ap, %rbp - bp
1808 # clobbers: everything
1809 my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi");
1811 .type __rsaz_512_mulx
,\
@abi-omnipotent
1814 mulx
($ap), %rbx, %r8 # initial %rdx preloaded by caller
1815 xor $zero, $zero # cf=0,of=0
1817 mulx
8($ap), %rax, %r9
1820 mulx
16($ap), %rbx, %r10
1823 mulx
24($ap), %rax, %r11
1826 .byte
0xc4,0x62,0xe3,0xf6,0xa6,0x20,0x00,0x00,0x00 # mulx 32($ap), %rbx, %r12
1829 mulx
40($ap), %rax, %r13
1832 mulx
48($ap), %rbx, %r14
1835 mulx
56($ap), %rax, %r15
1839 adcx
$zero, %r15 # cf=0
1847 mulx
($ap), %rax, %r8
1851 mulx
8($ap), %rax, %r9
1855 mulx
16($ap), %rax, %r10
1859 mulx
24($ap), %rax, %r11
1863 .byte
0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00 # mulx 32($ap), %rax, %r12
1867 mulx
40($ap), %rax, %r13
1871 mulx
48($ap), %rax, %r14
1875 mulx
56($ap), %rax, %r15
1876 movq
64($bp,%rcx,8), %rdx
1877 movq
%rbx, 8+64-8(%rsp,%rcx,8)
1880 adcx
$zero, %r15 # cf=0
1886 mulx
($ap), %rax, %r8
1890 .byte
0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00 # mulx 8($ap), %rax, %r9
1894 .byte
0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00 # mulx 16($ap), %rax, %r10
1898 mulx
24($ap), %rax, %r11
1902 mulx
32($ap), %rax, %r12
1906 mulx
40($ap), %rax, %r13
1910 .byte
0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
1914 .byte
0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($ap), %rax, %r15
1919 mov
%rbx, 8+64-8(%rsp)
1921 mov
%r9, 8+64+8(%rsp)
1922 mov
%r10, 8+64+16(%rsp)
1923 mov
%r11, 8+64+24(%rsp)
1924 mov
%r12, 8+64+32(%rsp)
1925 mov
%r13, 8+64+40(%rsp)
1926 mov
%r14, 8+64+48(%rsp)
1927 mov
%r15, 8+64+56(%rsp)
1930 .size __rsaz_512_mulx
,.-__rsaz_512_mulx
1934 my ($out,$inp,$power)= $win64 ?
("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
1936 .globl rsaz_512_scatter4
1937 .type rsaz_512_scatter4
,\
@abi-omnipotent
1940 leaq
($out,$power,4), $out
1950 leaq
128($out), $out
1954 .size rsaz_512_scatter4
,.-rsaz_512_scatter4
1956 .globl rsaz_512_gather4
1957 .type rsaz_512_gather4
,\
@abi-omnipotent
1960 leaq
($inp,$power,4), $inp
1967 leaq
128($inp), $inp
1975 .size rsaz_512_gather4
,.-rsaz_512_gather4
1979 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1980 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
1988 .extern __imp_RtlVirtualUnwind
1989 .type se_handler
,\
@abi-omnipotent
2003 mov
120($context),%rax # pull context->Rax
2004 mov
248($context),%rbx # pull context->Rip
2006 mov
8($disp),%rsi # disp->ImageBase
2007 mov
56($disp),%r11 # disp->HandlerData
2009 mov
0(%r11),%r10d # HandlerData[0]
2010 lea
(%rsi,%r10),%r10 # end of prologue label
2011 cmp %r10,%rbx # context->Rip<end of prologue label
2012 jb
.Lcommon_seh_tail
2014 mov
152($context),%rax # pull context->Rsp
2016 mov
4(%r11),%r10d # HandlerData[1]
2017 lea
(%rsi,%r10),%r10 # epilogue label
2018 cmp %r10,%rbx # context->Rip>=epilogue label
2019 jae
.Lcommon_seh_tail
2021 lea
128+24+48(%rax),%rax
2029 mov
%rbx,144($context) # restore context->Rbx
2030 mov
%rbp,160($context) # restore context->Rbp
2031 mov
%r12,216($context) # restore context->R12
2032 mov
%r13,224($context) # restore context->R13
2033 mov
%r14,232($context) # restore context->R14
2034 mov
%r15,240($context) # restore context->R15
2039 mov
%rax,152($context) # restore context->Rsp
2040 mov
%rsi,168($context) # restore context->Rsi
2041 mov
%rdi,176($context) # restore context->Rdi
2043 mov
40($disp),%rdi # disp->ContextRecord
2044 mov
$context,%rsi # context
2045 mov \
$154,%ecx # sizeof(CONTEXT)
2046 .long
0xa548f3fc # cld; rep movsq
2049 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
2050 mov
8(%rsi),%rdx # arg2, disp->ImageBase
2051 mov
0(%rsi),%r8 # arg3, disp->ControlPc
2052 mov
16(%rsi),%r9 # arg4, disp->FunctionEntry
2053 mov
40(%rsi),%r10 # disp->ContextRecord
2054 lea
56(%rsi),%r11 # &disp->HandlerData
2055 lea
24(%rsi),%r12 # &disp->EstablisherFrame
2056 mov
%r10,32(%rsp) # arg5
2057 mov
%r11,40(%rsp) # arg6
2058 mov
%r12,48(%rsp) # arg7
2059 mov
%rcx,56(%rsp) # arg8, (NULL)
2060 call
*__imp_RtlVirtualUnwind
(%rip)
2062 mov \
$1,%eax # ExceptionContinueSearch
2074 .size sqr_handler
,.-sqr_handler
2078 .rva
.LSEH_begin_rsaz_512_sqr
2079 .rva
.LSEH_end_rsaz_512_sqr
2080 .rva
.LSEH_info_rsaz_512_sqr
2082 .rva
.LSEH_begin_rsaz_512_mul
2083 .rva
.LSEH_end_rsaz_512_mul
2084 .rva
.LSEH_info_rsaz_512_mul
2086 .rva
.LSEH_begin_rsaz_512_mul_gather4
2087 .rva
.LSEH_end_rsaz_512_mul_gather4
2088 .rva
.LSEH_info_rsaz_512_mul_gather4
2090 .rva
.LSEH_begin_rsaz_512_mul_scatter4
2091 .rva
.LSEH_end_rsaz_512_mul_scatter4
2092 .rva
.LSEH_info_rsaz_512_mul_scatter4
2094 .rva
.LSEH_begin_rsaz_512_mul_by_one
2095 .rva
.LSEH_end_rsaz_512_mul_by_one
2096 .rva
.LSEH_info_rsaz_512_mul_by_one
2100 .LSEH_info_rsaz_512_sqr
:
2103 .rva
.Lsqr_body
,.Lsqr_epilogue
# HandlerData[]
2104 .LSEH_info_rsaz_512_mul
:
2107 .rva
.Lmul_body
,.Lmul_epilogue
# HandlerData[]
2108 .LSEH_info_rsaz_512_mul_gather4
:
2111 .rva
.Lmul_gather4_body
,.Lmul_gather4_epilogue
# HandlerData[]
2112 .LSEH_info_rsaz_512_mul_scatter4
:
2115 .rva
.Lmul_scatter4_body
,.Lmul_scatter4_epilogue
# HandlerData[]
2116 .LSEH_info_rsaz_512_mul_by_one
:
2119 .rva
.Lmul_by_one_body
,.Lmul_by_one_epilogue
# HandlerData[]
2123 $code =~ s/\`([^\`]*)\`/eval $1/gem;