3 ##############################################################################
5 # Copyright (c) 2012, Intel Corporation #
7 # All rights reserved. #
9 # Redistribution and use in source and binary forms, with or without #
10 # modification, are permitted provided that the following conditions are #
13 # * Redistributions of source code must retain the above copyright #
14 # notice, this list of conditions and the following disclaimer. #
16 # * Redistributions in binary form must reproduce the above copyright #
17 # notice, this list of conditions and the following disclaimer in the #
18 # documentation and/or other materials provided with the #
21 # * Neither the name of the Intel Corporation nor the names of its #
22 # contributors may be used to endorse or promote products derived from #
23 # this software without specific prior written permission. #
26 # THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY #
27 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE #
28 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR #
29 # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR #
30 # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, #
31 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, #
32 # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR #
33 # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF #
34 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING #
35 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS #
36 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #
38 ##############################################################################
39 # Developers and authors: #
40 # Shay Gueron (1, 2), and Vlad Krasnov (1) #
41 # (1) Intel Architecture Group, Microprocessor and Chipset Development, #
42 # Israel Development Center, Haifa, Israel #
43 # (2) University of Haifa #
44 ##############################################################################
46 # [1] S. Gueron, "Efficient Software Implementations of Modular #
47 # Exponentiation", http://eprint.iacr.org/2011/239 #
48 # [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring". #
49 # IEEE Proceedings of 9th International Conference on Information #
50 # Technology: New Generations (ITNG 2012), 821-823 (2012). #
51 # [3] S. Gueron, Efficient Software Implementations of Modular Exponentiation#
52 # Journal of Cryptographic Engineering 2:31-43 (2012). #
53 # [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis #
54 # resistant 512-bit and 1024-bit modular exponentiation for optimizing #
55 # RSA1024 and RSA2048 on x86_64 platforms", #
56 # http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest#
57 ##############################################################################
# While the original submission covers 512- and 1024-bit exponentiation,
# this module is limited to the 512-bit version only (and as such
# accelerates RSA1024 sign). This is because the improvement for longer
# keys is not high enough to justify the effort; the highest measured
# was ~5% on Westmere. [This is relative to OpenSSL 1.0.2, which was
# upcoming at the time of this writing!] Nor does this module implement
# a "monolithic" complete-exponentiation jumbo-subroutine; instead it
# adheres to a more modular mixture of C and assembly. It is also
# optimized for processors other than the Intel Core family (see the
# table below for improvement coefficients).
# RSA1024 sign/sec  this/original  | this/rsax(*)  this/fips(*)
# ---------------------------------+---------------------------
# Opteron           +13%           | +5%           +20%
# Bulldozer         -0%            | -1%           +10%
# Westmere          +5%            | +14%          +17%
# Sandy Bridge      +2%            | +12%          +29%
# Ivy Bridge        +1%            | +11%          +35%
# Haswell(**)       -0%            | +12%          +39%
# VIA Nano          +70%           | +9%           +25%
83 # (*) rsax engine and fips numbers are presented for reference
85 # (**) MULX was attempted, but found to give only marginal improvement;
# If $flavour contains a dot it is treated as the output file name and the
# flavour itself is left undefined.
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 mode is selected by a nasm/masm/mingw64 flavour or a *.asm output name.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Directory part of this script's path, including the trailing separator.
# Use an explicit empty string when $0 has no directory component instead of
# relying on whatever $1 holds after a failed match.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

# Look for the translator next to this script first, then in ../../perlasm.
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Pipe the generated code through the translator, run with the current perl
# ($^X).  The translator path is quoted so directories containing spaces
# work, and a failure to start the pipe is fatal rather than silently
# producing no output.
open OUT,"| \"$^X\" \"$xlate\" $flavour $output"
    or die "can't call $xlate: $!";
101 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
102 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
106 if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM
} =~ /nasm/) &&
107 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
111 if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM
} =~ /ml64/) &&
112 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
# Last resort: ask the compiler driver itself.  If it identifies as
# clang/LLVM, enable the MULX/ADCX/ADOX code path ($addx) for version 3.3 or
# later.  The major version is matched as one or more digits so that
# clang/LLVM 10 and later are recognised (a single [3-9] digit never matches
# "10.0").
if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([0-9]+)\.([0-9]+)/) {
	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
	$addx = ($ver>=3.03);
121 ($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp"); # common internal API
123 my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d");
128 .extern OPENSSL_ia32cap_P
131 .type rsaz_512_sqr
,\
@function,5
133 rsaz_512_sqr
: # 25-29% faster than rsaz_512_mul
143 movq
$mod, %rbp # common argument
148 $code.=<<___
if ($addx);
150 andl OPENSSL_ia32cap_P
+8(%rip),%r11d
151 cmpl \
$0x80100,%r11d # check for MULX and ADO/CX
159 movl
$times,128+8(%rsp)
203 addq
%r8, %r8 #shlq \$1, %r8
205 adcq
%r9, %r9 #shld \$1, %r8, %r9
266 lea
(%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
268 adcq
%r11, %r11 #shld \$1, %r10, %r11
306 lea
(%rbx,%r12,2), %r12 #shld \$1, %rbx, %r12
324 leaq
(%r10,%r13,2), %r13 #shld \$1, %r12, %r13
354 leaq
(%rcx,%r14,2), %r14 #shld \$1, %rcx, %r14
372 leaq
(%r12,%r15,2),%r15 #shld \$1, %r14, %r15
397 leaq
(%rbx,%r8,2), %r8 #shld \$1, %rbx, %r8
412 leaq
(%r12,%r9,2), %r9 #shld \$1, %r8, %r9
436 leaq
(%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
444 leaq
(%r15,%r11,2), %r11 #shld \$1, %r10, %r11
465 adcq
%r12, %r12 #shld \$1, %rbx, %r12
466 adcq
%r13, %r13 #shld \$1, %r12, %r13
467 adcq
%r14, %r14 #shld \$1, %r13, %r14
497 call __rsaz_512_reduce
509 call __rsaz_512_subtract
513 movl
128+8(%rsp), $times
525 movl
$times,128+8(%rsp)
526 movq
$out, %xmm0 # off-load
527 movq
%rbp, %xmm1 # off-load
531 mulx
16($inp), %rcx, %r10
532 xor %rbp, %rbp # cf=0, of=0
534 mulx
24($inp), %rax, %r11
537 mulx
32($inp), %rcx, %r12
540 mulx
40($inp), %rax, %r13
543 .byte
0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($inp), %rcx, %r14
547 .byte
0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r15
549 adcx
%rbp, %r15 # %rbp is 0
556 mulx
%rdx, %rax, %rdx
565 mulx
16($inp), %rax, %rbx
569 .byte
0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r8
573 mulx
32($inp), %rax, %rbx
577 mulx
40($inp), $out, %r8
581 .byte
0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
585 .byte
0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r8
595 mulx
%rdx, %rax, %rcx
602 .byte
0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 # mov %r10, 24(%rsp)
605 .byte
0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r9
609 mulx
32($inp), %rax, %rcx
613 mulx
40($inp), $out, %r9
617 .byte
0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rcx
621 .byte
0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r9
631 mulx
%rdx, %rax, %rdx
638 .byte
0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00 # mov %r12, 40(%rsp)
641 .byte
0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00 # mulx 32($inp), %rax, %rbx
645 mulx
40($inp), $out, %r10
649 mulx
48($inp), %rax, %rbx
653 mulx
56($inp), $out, %r10
664 mulx
%rdx, %rax, %rdx
674 .byte
0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00 # mulx 40($inp), $out, %r11
678 mulx
48($inp), %rax, %rcx
682 mulx
56($inp), $out, %r11
692 mulx
%rdx, %rax, %rdx
702 .byte
0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
706 .byte
0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r12
716 mulx
%rdx, %rax, %rdx
726 .byte
0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r13
736 mulx
%rdx, %rax, %rdx
742 .byte
0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 # mov %r11, 96(%rsp)
743 .byte
0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 # mov %r12, 104(%rsp)
746 mulx
%rdx, %rax, %rdx
758 movq
128(%rsp), %rdx # pull $n0
768 call __rsaz_512_reducex
780 call __rsaz_512_subtract
784 movl
128+8(%rsp), $times
795 leaq
128+24+48(%rsp), %rax
805 .size rsaz_512_sqr
,.-rsaz_512_sqr
809 my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
812 .type rsaz_512_mul
,\
@function,5
824 movq
$out, %xmm0 # off-load arguments
828 $code.=<<___
if ($addx);
830 andl OPENSSL_ia32cap_P
+8(%rip),%r11d
831 cmpl \
$0x80100,%r11d # check for MULX and ADO/CX
835 movq
($bp), %rbx # pass b[0]
836 movq
$bp, %rbp # pass argument
851 call __rsaz_512_reduce
853 $code.=<<___
if ($addx);
858 movq
$bp, %rbp # pass argument
859 movq
($bp), %rdx # pass b[0]
865 movq
128(%rsp), %rdx # pull $n0
875 call __rsaz_512_reducex
889 call __rsaz_512_subtract
891 leaq
128+24+48(%rsp), %rax
901 .size rsaz_512_mul
,.-rsaz_512_mul
905 my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
907 .globl rsaz_512_mul_gather4
908 .type rsaz_512_mul_gather4
,\
@function,6
910 rsaz_512_mul_gather4
:
922 $code.=<<___
if ($addx);
924 andl OPENSSL_ia32cap_P
+8(%rip),%r11d
925 cmpl \
$0x80100,%r11d # check for MULX and ADO/CX
929 movl
64($bp,$pwr,4), %eax
930 movq
$out, %xmm0 # off-load arguments
931 movl
($bp,$pwr,4), %ebx
939 leaq
128($bp,$pwr,4), %rbp
940 mulq
%rbx # 0 iteration
1016 movd
64(%rbp), %xmm5
1067 leaq
128(%rbp), %rbp
1071 jnz
.Loop_mul_gather
1094 call __rsaz_512_reduce
1096 $code.=<<___
if ($addx);
1097 jmp
.Lmul_gather_tail
1101 mov
64($bp,$pwr,4), %eax
1102 movq
$out, %xmm0 # off-load arguments
1103 lea
128($bp,$pwr,4), %rbp
1104 mov
($bp,$pwr,4), %edx
1110 mulx
($ap), %rbx, %r8 # 0 iteration
1112 xor %edi, %edi # cf=0, of=0
1114 mulx
8($ap), %rax, %r9
1117 mulx
16($ap), %rbx, %r10
1118 movd
64(%rbp), %xmm5
1121 mulx
24($ap), %rax, %r11
1125 mulx
32($ap), %rbx, %r12
1129 mulx
40($ap), %rax, %r13
1132 mulx
48($ap), %rbx, %r14
1136 mulx
56($ap), %rax, %r15
1141 adcx
%rdi, %r15 # %rdi is 0
1144 jmp
.Loop_mulx_gather
1148 mulx
($ap), %rax, %r8
1152 mulx
8($ap), %rax, %r9
1153 .byte
0x66,0x0f,0x6e,0xa5,0x00,0x00,0x00,0x00 # movd (%rbp), %xmm4
1157 mulx
16($ap), %rax, %r10
1158 movd
64(%rbp), %xmm5
1163 .byte
0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 # mulx 24($ap), %rax, %r11
1169 mulx
32($ap), %rax, %r12
1173 mulx
40($ap), %rax, %r13
1177 .byte
0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
1181 mulx
56($ap), %rax, %r15
1183 mov
%rbx, 64(%rsp,%rcx,8)
1187 adcx
%rdi, %r15 # cf=0
1190 jnz
.Loop_mulx_gather
1194 mov
%r10, 64+16(%rsp)
1195 mov
%r11, 64+24(%rsp)
1196 mov
%r12, 64+32(%rsp)
1197 mov
%r13, 64+40(%rsp)
1198 mov
%r14, 64+48(%rsp)
1199 mov
%r15, 64+56(%rsp)
1204 mov
128(%rsp), %rdx # pull $n0
1214 call __rsaz_512_reducex
1224 adcq
104(%rsp), %r13
1225 adcq
112(%rsp), %r14
1226 adcq
120(%rsp), %r15
1229 call __rsaz_512_subtract
1231 leaq
128+24+48(%rsp), %rax
1232 movq
-48(%rax), %r15
1233 movq
-40(%rax), %r14
1234 movq
-32(%rax), %r13
1235 movq
-24(%rax), %r12
1236 movq
-16(%rax), %rbp
1239 .Lmul_gather4_epilogue
:
1241 .size rsaz_512_mul_gather4
,.-rsaz_512_mul_gather4
1245 my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1247 .globl rsaz_512_mul_scatter4
1248 .type rsaz_512_mul_scatter4
,\
@function,6
1250 rsaz_512_mul_scatter4
:
1260 .Lmul_scatter4_body
:
1261 leaq
($tbl,$pwr,4), $tbl
1262 movq
$out, %xmm0 # off-load arguments
1269 $code.=<<___
if ($addx);
1270 movl \
$0x80100,%r11d
1271 andl OPENSSL_ia32cap_P
+8(%rip),%r11d
1272 cmpl \
$0x80100,%r11d # check for MULX and ADO/CX
1276 movq
($out),%rbx # pass b[0]
1291 call __rsaz_512_reduce
1293 $code.=<<___
if ($addx);
1294 jmp
.Lmul_scatter_tail
1298 movq
($out), %rdx # pass b[0]
1299 call __rsaz_512_mulx
1304 movq
128(%rsp), %rdx # pull $n0
1314 call __rsaz_512_reducex
1324 adcq
104(%rsp), %r13
1325 adcq
112(%rsp), %r14
1326 adcq
120(%rsp), %r15
1330 call __rsaz_512_subtract
1332 movl
%r8d, 64*0($inp) # scatter
1334 movl
%r9d, 64*2($inp)
1336 movl
%r10d, 64*4($inp)
1338 movl
%r11d, 64*6($inp)
1340 movl
%r12d, 64*8($inp)
1342 movl
%r13d, 64*10($inp)
1344 movl
%r14d, 64*12($inp)
1346 movl
%r15d, 64*14($inp)
1348 movl
%r8d, 64*1($inp)
1349 movl
%r9d, 64*3($inp)
1350 movl
%r10d, 64*5($inp)
1351 movl
%r11d, 64*7($inp)
1352 movl
%r12d, 64*9($inp)
1353 movl
%r13d, 64*11($inp)
1354 movl
%r14d, 64*13($inp)
1355 movl
%r15d, 64*15($inp)
1357 leaq
128+24+48(%rsp), %rax
1358 movq
-48(%rax), %r15
1359 movq
-40(%rax), %r14
1360 movq
-32(%rax), %r13
1361 movq
-24(%rax), %r12
1362 movq
-16(%rax), %rbp
1365 .Lmul_scatter4_epilogue
:
1367 .size rsaz_512_mul_scatter4
,.-rsaz_512_mul_scatter4
1371 my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx");
1373 .globl rsaz_512_mul_by_one
1374 .type rsaz_512_mul_by_one
,\
@function,4
1376 rsaz_512_mul_by_one
:
1387 $code.=<<___
if ($addx);
1388 movl OPENSSL_ia32cap_P
+8(%rip),%eax
1391 movq
$mod, %rbp # reassign argument
1404 movdqa
%xmm0, (%rsp)
1405 movdqa
%xmm0, 16(%rsp)
1406 movdqa
%xmm0, 32(%rsp)
1407 movdqa
%xmm0, 48(%rsp)
1408 movdqa
%xmm0, 64(%rsp)
1409 movdqa
%xmm0, 80(%rsp)
1410 movdqa
%xmm0, 96(%rsp)
1412 $code.=<<___
if ($addx);
1414 cmpl \
$0x80100,%eax # check for MULX and ADO/CX
1418 call __rsaz_512_reduce
1420 $code.=<<___
if ($addx);
1424 movq
128(%rsp), %rdx # pull $n0
1425 call __rsaz_512_reducex
1438 leaq
128+24+48(%rsp), %rax
1439 movq
-48(%rax), %r15
1440 movq
-40(%rax), %r14
1441 movq
-32(%rax), %r13
1442 movq
-24(%rax), %r12
1443 movq
-16(%rax), %rbp
1446 .Lmul_by_one_epilogue
:
1448 .size rsaz_512_mul_by_one
,.-rsaz_512_mul_by_one
1451 { # __rsaz_512_reduce
1453 # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
1455 # clobbers: everything except %rbp and %rdi
1457 .type __rsaz_512_reduce
,\
@abi-omnipotent
1461 imulq
128+8(%rsp), %rbx
1464 jmp
.Lreduction_loop
1495 movq
128+8(%rsp), %rsi
1536 jne
.Lreduction_loop
1539 .size __rsaz_512_reduce
,.-__rsaz_512_reduce
1543 # __rsaz_512_reducex
1545 # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
1547 # clobbers: everything except %rbp and %rdi
1549 .type __rsaz_512_reducex
,\
@abi-omnipotent
1552 #movq 128+8(%rsp), %rdx # pull $n0
1554 xorq
%rsi, %rsi # cf=0,of=0
1556 jmp
.Lreduction_loopx
1561 mulx
0(%rbp), %rax, %r8
1565 mulx
8(%rbp), %rax, %r9
1569 mulx
16(%rbp), %rbx, %r10
1573 mulx
24(%rbp), %rbx, %r11
1577 .byte
0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 32(%rbp), %rbx, %r12
1583 mulx
128+8(%rsp), %rbx, %rdx
1586 mulx
40(%rbp), %rax, %r13
1590 .byte
0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00 # mulx 48(%rbp), %rax, %r14
1594 mulx
56(%rbp), %rax, %r15
1597 adox
%rsi, %r15 # %rsi is 0
1598 adcx
%rsi, %r15 # cf=0
1601 jne
.Lreduction_loopx
1604 .size __rsaz_512_reducex
,.-__rsaz_512_reducex
1607 { # __rsaz_512_subtract
1608 # input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask
1610 # clobbers: everything but %rdi, %rsi and %rbp
1612 .type __rsaz_512_subtract
,\
@abi-omnipotent
1614 __rsaz_512_subtract
:
1668 .size __rsaz_512_subtract
,.-__rsaz_512_subtract
1673 # input: %rsi - ap, %rbp - bp
1675 # clobbers: everything
1676 my ($ap,$bp) = ("%rsi","%rbp");
1678 .type __rsaz_512_mul
,\
@abi-omnipotent
1819 .size __rsaz_512_mul
,.-__rsaz_512_mul
1825 # input: %rsi - ap, %rbp - bp
1827 # clobbers: everything
1828 my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi");
1830 .type __rsaz_512_mulx
,\
@abi-omnipotent
1833 mulx
($ap), %rbx, %r8 # initial %rdx preloaded by caller
1836 mulx
8($ap), %rax, %r9
1839 mulx
16($ap), %rbx, %r10
1842 mulx
24($ap), %rax, %r11
1845 mulx
32($ap), %rbx, %r12
1848 mulx
40($ap), %rax, %r13
1851 mulx
48($ap), %rbx, %r14
1854 mulx
56($ap), %rax, %r15
1860 xor $zero, $zero # cf=0,of=0
1866 mulx
($ap), %rax, %r8
1870 mulx
8($ap), %rax, %r9
1874 mulx
16($ap), %rax, %r10
1878 mulx
24($ap), %rax, %r11
1882 .byte
0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00 # mulx 32($ap), %rax, %r12
1886 mulx
40($ap), %rax, %r13
1890 mulx
48($ap), %rax, %r14
1894 mulx
56($ap), %rax, %r15
1895 movq
64($bp,%rcx,8), %rdx
1896 movq
%rbx, 8+64-8(%rsp,%rcx,8)
1899 adcx
$zero, %r15 # cf=0
1905 mulx
($ap), %rax, %r8
1909 .byte
0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00 # mulx 8($ap), %rax, %r9
1913 .byte
0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00 # mulx 16($ap), %rax, %r10
1917 mulx
24($ap), %rax, %r11
1921 mulx
32($ap), %rax, %r12
1925 mulx
40($ap), %rax, %r13
1929 .byte
0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
1933 .byte
0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($ap), %rax, %r15
1938 mov
%rbx, 8+64-8(%rsp)
1940 mov
%r9, 8+64+8(%rsp)
1941 mov
%r10, 8+64+16(%rsp)
1942 mov
%r11, 8+64+24(%rsp)
1943 mov
%r12, 8+64+32(%rsp)
1944 mov
%r13, 8+64+40(%rsp)
1945 mov
%r14, 8+64+48(%rsp)
1946 mov
%r15, 8+64+56(%rsp)
1949 .size __rsaz_512_mulx
,.-__rsaz_512_mulx
1953 my ($out,$inp,$power)= $win64 ?
("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
1955 .globl rsaz_512_scatter4
1956 .type rsaz_512_scatter4
,\
@abi-omnipotent
1959 leaq
($out,$power,4), $out
1969 leaq
128($out), $out
1973 .size rsaz_512_scatter4
,.-rsaz_512_scatter4
1975 .globl rsaz_512_gather4
1976 .type rsaz_512_gather4
,\
@abi-omnipotent
1979 leaq
($inp,$power,4), $inp
1986 leaq
128($inp), $inp
1994 .size rsaz_512_gather4
,.-rsaz_512_gather4
1998 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1999 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
2007 .extern __imp_RtlVirtualUnwind
2008 .type se_handler
,\
@abi-omnipotent
2022 mov
120($context),%rax # pull context->Rax
2023 mov
248($context),%rbx # pull context->Rip
2025 mov
8($disp),%rsi # disp->ImageBase
2026 mov
56($disp),%r11 # disp->HandlerData
2028 mov
0(%r11),%r10d # HandlerData[0]
2029 lea
(%rsi,%r10),%r10 # end of prologue label
2030 cmp %r10,%rbx # context->Rip<end of prologue label
2031 jb
.Lcommon_seh_tail
2033 mov
152($context),%rax # pull context->Rsp
2035 mov
4(%r11),%r10d # HandlerData[1]
2036 lea
(%rsi,%r10),%r10 # epilogue label
2037 cmp %r10,%rbx # context->Rip>=epilogue label
2038 jae
.Lcommon_seh_tail
2040 lea
128+24+48(%rax),%rax
2048 mov
%rbx,144($context) # restore context->Rbx
2049 mov
%rbp,160($context) # restore context->Rbp
2050 mov
%r12,216($context) # restore context->R12
2051 mov
%r13,224($context) # restore context->R13
2052 mov
%r14,232($context) # restore context->R14
2053 mov
%r15,240($context) # restore context->R15
2058 mov
%rax,152($context) # restore context->Rsp
2059 mov
%rsi,168($context) # restore context->Rsi
2060 mov
%rdi,176($context) # restore context->Rdi
2062 mov
40($disp),%rdi # disp->ContextRecord
2063 mov
$context,%rsi # context
2064 mov \
$154,%ecx # sizeof(CONTEXT)
2065 .long
0xa548f3fc # cld; rep movsq
2068 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
2069 mov
8(%rsi),%rdx # arg2, disp->ImageBase
2070 mov
0(%rsi),%r8 # arg3, disp->ControlPc
2071 mov
16(%rsi),%r9 # arg4, disp->FunctionEntry
2072 mov
40(%rsi),%r10 # disp->ContextRecord
2073 lea
56(%rsi),%r11 # &disp->HandlerData
2074 lea
24(%rsi),%r12 # &disp->EstablisherFrame
2075 mov
%r10,32(%rsp) # arg5
2076 mov
%r11,40(%rsp) # arg6
2077 mov
%r12,48(%rsp) # arg7
2078 mov
%rcx,56(%rsp) # arg8, (NULL)
2079 call
*__imp_RtlVirtualUnwind
(%rip)
2081 mov \
$1,%eax # ExceptionContinueSearch
.size	se_handler,.-se_handler	# must name the symbol declared by .type se_handler
2097 .rva
.LSEH_begin_rsaz_512_sqr
2098 .rva
.LSEH_end_rsaz_512_sqr
2099 .rva
.LSEH_info_rsaz_512_sqr
2101 .rva
.LSEH_begin_rsaz_512_mul
2102 .rva
.LSEH_end_rsaz_512_mul
2103 .rva
.LSEH_info_rsaz_512_mul
2105 .rva
.LSEH_begin_rsaz_512_mul_gather4
2106 .rva
.LSEH_end_rsaz_512_mul_gather4
2107 .rva
.LSEH_info_rsaz_512_mul_gather4
2109 .rva
.LSEH_begin_rsaz_512_mul_scatter4
2110 .rva
.LSEH_end_rsaz_512_mul_scatter4
2111 .rva
.LSEH_info_rsaz_512_mul_scatter4
2113 .rva
.LSEH_begin_rsaz_512_mul_by_one
2114 .rva
.LSEH_end_rsaz_512_mul_by_one
2115 .rva
.LSEH_info_rsaz_512_mul_by_one
2119 .LSEH_info_rsaz_512_sqr
:
2122 .rva
.Lsqr_body
,.Lsqr_epilogue
# HandlerData[]
2123 .LSEH_info_rsaz_512_mul
:
2126 .rva
.Lmul_body
,.Lmul_epilogue
# HandlerData[]
2127 .LSEH_info_rsaz_512_mul_gather4
:
2130 .rva
.Lmul_gather4_body
,.Lmul_gather4_epilogue
# HandlerData[]
2131 .LSEH_info_rsaz_512_mul_scatter4
:
2134 .rva
.Lmul_scatter4_body
,.Lmul_scatter4_epilogue
# HandlerData[]
2135 .LSEH_info_rsaz_512_mul_by_one
:
2138 .rva
.Lmul_by_one_body
,.Lmul_by_one_epilogue
# HandlerData[]
2142 $code =~ s/\`([^\`]*)\`/eval $1/gem;