2 # Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 ##############################################################################
12 # Copyright (c) 2012, Intel Corporation #
14 # All rights reserved. #
16 # Redistribution and use in source and binary forms, with or without #
17 # modification, are permitted provided that the following conditions are #
20 # * Redistributions of source code must retain the above copyright #
21 # notice, this list of conditions and the following disclaimer. #
23 # * Redistributions in binary form must reproduce the above copyright #
24 # notice, this list of conditions and the following disclaimer in the #
25 # documentation and/or other materials provided with the #
28 # * Neither the name of the Intel Corporation nor the names of its #
29 # contributors may be used to endorse or promote products derived from #
30 # this software without specific prior written permission. #
33 # THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY #
34 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE #
35 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR #
36 # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR #
37 # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, #
38 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, #
39 # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR #
40 # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF #
41 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING #
42 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS #
43 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #
45 ##############################################################################
46 # Developers and authors: #
47 # Shay Gueron (1, 2), and Vlad Krasnov (1) #
48 # (1) Intel Architecture Group, Microprocessor and Chipset Development, #
49 # Israel Development Center, Haifa, Israel #
50 # (2) University of Haifa #
51 ##############################################################################
53 # [1] S. Gueron, "Efficient Software Implementations of Modular #
54 # Exponentiation", http://eprint.iacr.org/2011/239 #
55 # [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring". #
56 # IEEE Proceedings of 9th International Conference on Information #
57 # Technology: New Generations (ITNG 2012), 821-823 (2012). #
58 # [3] S. Gueron, Efficient Software Implementations of Modular Exponentiation#
59 # Journal of Cryptographic Engineering 2:31-43 (2012). #
60 # [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis #
61 # resistant 512-bit and 1024-bit modular exponentiation for optimizing #
62 # RSA1024 and RSA2048 on x86_64 platforms", #
63 # http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest#
64 ##############################################################################
66 # While original submission covers 512- and 1024-bit exponentiation,
67 # this module is limited to 512-bit version only (and as such
68 # accelerates RSA1024 sign). This is because improvement for longer
69 # keys is not high enough to justify the effort, highest measured
70 # was ~5% on Westmere. [This is relative to OpenSSL 1.0.2, upcoming
71 # for the moment of this writing!] Nor does this module implement
72 # "monolithic" complete exponentiation jumbo-subroutine, but adheres
73 # to more modular mixture of C and assembly. And it's optimized even
74 # for processors other than Intel Core family (see table below for
75 # improvement coefficients).
78 # RSA1024 sign/sec this/original |this/rsax(*) this/fips(*)
79 # ----------------+---------------------------
80 # Opteron +13% |+5% +20%
81 # Bulldozer -0% |-1% +10%
83 # Westmere +5% |+14% +17%
84 # Sandy Bridge +2% |+12% +29%
85 # Ivy Bridge +1% |+11% +35%
86 # Haswell(**) -0% |+12% +39%
88 # VIA Nano +70% |+9% +25%
90 # (*) rsax engine and fips numbers are presented for reference
92 # (**) MULX was attempted, but found to give only marginal improvement;
96 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } # a "flavour" containing '.' is treated as the output file name instead
98 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); # Win64 target: nasm/masm/mingw64 flavour, or output file ending in .asm
100 $0 =~ m/(.*[\/\\])[^\
/\\]+$/; $dir=$1; # $dir = directory part of this script's path ($0), trailing separator included
101 ( $xlate="${dir}x86_64-xlate.pl" and -f
$xlate ) or # first candidate: translator in the same directory as this script
102 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f
$xlate) or # second candidate: ../../perlasm relative to this script
103 die "can't locate x86_64-xlate.pl"; # neither candidate is an existing plain file
105 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" # pipe everything written to OUT through the perlasm translator, run with the current perl ($^X)
    or die "can't call $xlate: $!"; # abort instead of carrying on with an unopened handle
108 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
109 =~ /GNU assembler version ([2-9]\.[0-9]+)/) { # GNU as reached through $CC; its version is captured in $1
113 if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM
} =~ /nasm/) &&
114 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { # only tried while $addx is still unset on a Win64 nasm build; NASM version captured in $1
118 if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM
} =~ /ml64/) &&
119 `ml64 2>&1` =~ /Version ([0-9]+)\./) { # only tried while $addx is still unset on a Win64 masm build; ml64 major version captured in $1
123 if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) { # clang/LLVM probe: accepts multi-digit majors (10+) and vendor-prefixed banners such as "Apple clang version ..."
124 my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
125 $addx = ($ver>=3.03); # enable the MULX/ADCX/ADOX code paths from clang 3.3 on
128 ($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp"); # common internal API
130 my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d"); # register names substituted for $out/$inp/$mod/$n0/$times in the rsaz_512_sqr code that follows
# NOTE(review): the lea/adc doubling steps in that squaring code resemble the pre-fix sequence of CVE-2019-1551 (rsaz_512_sqr overflow) — TODO confirm against upstream
135 .extern OPENSSL_ia32cap_P
138 .type rsaz_512_sqr
,\
@function,5
140 rsaz_512_sqr
: # 25-29% faster than rsaz_512_mul
150 movq
$mod, %rbp # common argument
155 $code.=<<___
if ($addx);
157 andl OPENSSL_ia32cap_P
+8(%rip),%r11d
158 cmpl \
$0x80100,%r11d # check for MULX and ADO/CX
166 movl
$times,128+8(%rsp)
210 addq
%r8, %r8 #shlq \$1, %r8
212 adcq
%r9, %r9 #shld \$1, %r8, %r9
273 lea
(%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
275 adcq
%r11, %r11 #shld \$1, %r10, %r11
313 lea
(%rbx,%r12,2), %r12 #shld \$1, %rbx, %r12
331 leaq
(%r10,%r13,2), %r13 #shld \$1, %r12, %r13
361 leaq
(%rcx,%r14,2), %r14 #shld \$1, %rcx, %r14
379 leaq
(%r12,%r15,2),%r15 #shld \$1, %r14, %r15
404 leaq
(%rbx,%r8,2), %r8 #shld \$1, %rbx, %r8
419 leaq
(%r12,%r9,2), %r9 #shld \$1, %r8, %r9
443 leaq
(%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
451 leaq
(%r15,%r11,2), %r11 #shld \$1, %r10, %r11
472 adcq
%r12, %r12 #shld \$1, %rbx, %r12
473 adcq
%r13, %r13 #shld \$1, %r12, %r13
474 adcq
%r14, %r14 #shld \$1, %r13, %r14
504 call __rsaz_512_reduce
516 call __rsaz_512_subtract
520 movl
128+8(%rsp), $times
532 movl
$times,128+8(%rsp)
533 movq
$out, %xmm0 # off-load
534 movq
%rbp, %xmm1 # off-load
538 mulx
16($inp), %rcx, %r10
539 xor %rbp, %rbp # cf=0, of=0
541 mulx
24($inp), %rax, %r11
544 mulx
32($inp), %rcx, %r12
547 mulx
40($inp), %rax, %r13
550 .byte
0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($inp), %rcx, %r14
554 .byte
0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r15
556 adcx
%rbp, %r15 # %rbp is 0
563 mulx
%rdx, %rax, %rdx
572 mulx
16($inp), %rax, %rbx
576 .byte
0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r8
580 mulx
32($inp), %rax, %rbx
584 mulx
40($inp), $out, %r8
588 .byte
0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
592 .byte
0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r8
602 mulx
%rdx, %rax, %rcx
609 .byte
0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 # mov %r10, 24(%rsp)
612 .byte
0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r9
616 mulx
32($inp), %rax, %rcx
620 mulx
40($inp), $out, %r9
624 .byte
0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rcx
628 .byte
0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r9
638 mulx
%rdx, %rax, %rdx
645 .byte
0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00 # mov %r12, 40(%rsp)
648 .byte
0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00 # mulx 32($inp), %rax, %rbx
652 mulx
40($inp), $out, %r10
656 mulx
48($inp), %rax, %rbx
660 mulx
56($inp), $out, %r10
671 mulx
%rdx, %rax, %rdx
681 .byte
0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00 # mulx 40($inp), $out, %r11
685 mulx
48($inp), %rax, %rcx
689 mulx
56($inp), $out, %r11
699 mulx
%rdx, %rax, %rdx
709 .byte
0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
713 .byte
0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r12
723 mulx
%rdx, %rax, %rdx
733 .byte
0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r13
743 mulx
%rdx, %rax, %rdx
749 .byte
0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 # mov %r11, 96(%rsp)
750 .byte
0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 # mov %r12, 104(%rsp)
753 mulx
%rdx, %rax, %rdx
765 movq
128(%rsp), %rdx # pull $n0
775 call __rsaz_512_reducex
787 call __rsaz_512_subtract
791 movl
128+8(%rsp), $times
802 leaq
128+24+48(%rsp), %rax
812 .size rsaz_512_sqr
,.-rsaz_512_sqr
816 my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
819 .type rsaz_512_mul
,\
@function,5
831 movq
$out, %xmm0 # off-load arguments
835 $code.=<<___
if ($addx);
837 andl OPENSSL_ia32cap_P
+8(%rip),%r11d
838 cmpl \
$0x80100,%r11d # check for MULX and ADO/CX
842 movq
($bp), %rbx # pass b[0]
843 movq
$bp, %rbp # pass argument
858 call __rsaz_512_reduce
860 $code.=<<___
if ($addx);
865 movq
$bp, %rbp # pass argument
866 movq
($bp), %rdx # pass b[0]
872 movq
128(%rsp), %rdx # pull $n0
882 call __rsaz_512_reducex
896 call __rsaz_512_subtract
898 leaq
128+24+48(%rsp), %rax
908 .size rsaz_512_mul
,.-rsaz_512_mul
912 my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
914 .globl rsaz_512_mul_gather4
915 .type rsaz_512_mul_gather4
,\
@function,6
917 rsaz_512_mul_gather4
:
925 subq \
$`128+24+($win64?0xb0:0)`, %rsp
927 $code.=<<___
if ($win64);
928 movaps
%xmm6,0xa0(%rsp)
929 movaps
%xmm7,0xb0(%rsp)
930 movaps
%xmm8,0xc0(%rsp)
931 movaps
%xmm9,0xd0(%rsp)
932 movaps
%xmm10,0xe0(%rsp)
933 movaps
%xmm11,0xf0(%rsp)
934 movaps
%xmm12,0x100(%rsp)
935 movaps
%xmm13,0x110(%rsp)
936 movaps
%xmm14,0x120(%rsp)
937 movaps
%xmm15,0x130(%rsp)
942 movdqa
.Linc
+16(%rip),%xmm1 # 00000002000000020000000200000002
943 movdqa
.Linc
(%rip),%xmm0 # 00000001000000010000000000000000
945 pshufd \
$0,%xmm8,%xmm8 # broadcast $power
949 ########################################################################
950 # calculate mask by comparing 0..15 to $power
952 for($i=0;$i<4;$i++) {
954 paddd
%xmm`$i`,%xmm`$i+1`
955 pcmpeqd
%xmm8,%xmm`$i`
956 movdqa
%xmm7,%xmm`$i+3`
961 paddd
%xmm`$i`,%xmm`$i+1`
962 pcmpeqd
%xmm8,%xmm`$i`
968 movdqa
16*0($bp),%xmm8
969 movdqa
16*1($bp),%xmm9
970 movdqa
16*2($bp),%xmm10
971 movdqa
16*3($bp),%xmm11
973 movdqa
16*4($bp),%xmm12
975 movdqa
16*5($bp),%xmm13
977 movdqa
16*6($bp),%xmm14
979 movdqa
16*7($bp),%xmm15
993 pshufd \
$0x4e,%xmm8,%xmm9
996 $code.=<<___
if ($addx);
998 andl OPENSSL_ia32cap_P
+8(%rip),%r11d
999 cmpl \
$0x80100,%r11d # check for MULX and ADO/CX
1005 movq
$n0, 128(%rsp) # off-load arguments
1006 movq
$out, 128+8(%rsp)
1007 movq
$mod, 128+16(%rsp)
1011 mulq
%rbx # 0 iteration
1060 jmp
.Loop_mul_gather
1064 movdqa
16*0(%rbp),%xmm8
1065 movdqa
16*1(%rbp),%xmm9
1066 movdqa
16*2(%rbp),%xmm10
1067 movdqa
16*3(%rbp),%xmm11
1069 movdqa
16*4(%rbp),%xmm12
1071 movdqa
16*5(%rbp),%xmm13
1073 movdqa
16*6(%rbp),%xmm14
1075 movdqa
16*7(%rbp),%xmm15
1076 leaq
128(%rbp), %rbp
1089 pshufd \
$0x4e,%xmm8,%xmm9
1159 jnz
.Loop_mul_gather
1170 movq
128+8(%rsp), $out
1171 movq
128+16(%rsp), %rbp
1182 call __rsaz_512_reduce
1184 $code.=<<___
if ($addx);
1185 jmp
.Lmul_gather_tail
1191 mov
$n0, 128(%rsp) # off-load arguments
1192 mov
$out, 128+8(%rsp)
1193 mov
$mod, 128+16(%rsp)
1195 mulx
($ap), %rbx, %r8 # 0 iteration
1197 xor %edi, %edi # cf=0, of=0
1199 mulx
8($ap), %rax, %r9
1201 mulx
16($ap), %rbx, %r10
1204 mulx
24($ap), %rax, %r11
1207 mulx
32($ap), %rbx, %r12
1210 mulx
40($ap), %rax, %r13
1213 mulx
48($ap), %rbx, %r14
1216 mulx
56($ap), %rax, %r15
1221 adcx
%rdi, %r15 # %rdi is 0
1224 jmp
.Loop_mulx_gather
1228 movdqa
16*0(%rbp),%xmm8
1229 movdqa
16*1(%rbp),%xmm9
1230 movdqa
16*2(%rbp),%xmm10
1231 movdqa
16*3(%rbp),%xmm11
1233 movdqa
16*4(%rbp),%xmm12
1235 movdqa
16*5(%rbp),%xmm13
1237 movdqa
16*6(%rbp),%xmm14
1239 movdqa
16*7(%rbp),%xmm15
1240 leaq
128(%rbp), %rbp
1253 pshufd \
$0x4e,%xmm8,%xmm9
1257 .byte
0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00 # mulx ($ap), %rax, %r8
1261 mulx
8($ap), %rax, %r9
1265 mulx
16($ap), %rax, %r10
1269 .byte
0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 # mulx 24($ap), %rax, %r11
1273 mulx
32($ap), %rax, %r12
1277 mulx
40($ap), %rax, %r13
1281 .byte
0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
1286 mulx
56($ap), %rax, %r15
1287 mov
%rbx, 64(%rsp,%rcx,8)
1291 adcx
%rdi, %r15 # cf=0
1294 jnz
.Loop_mulx_gather
1298 mov
%r10, 64+16(%rsp)
1299 mov
%r11, 64+24(%rsp)
1300 mov
%r12, 64+32(%rsp)
1301 mov
%r13, 64+40(%rsp)
1302 mov
%r14, 64+48(%rsp)
1303 mov
%r15, 64+56(%rsp)
1305 mov
128(%rsp), %rdx # pull arguments
1306 mov
128+8(%rsp), $out
1307 mov
128+16(%rsp), %rbp
1318 call __rsaz_512_reducex
1328 adcq
104(%rsp), %r13
1329 adcq
112(%rsp), %r14
1330 adcq
120(%rsp), %r15
1333 call __rsaz_512_subtract
1335 leaq
128+24+48(%rsp), %rax
1337 $code.=<<___
if ($win64);
1338 movaps
0xa0-0xc8(%rax),%xmm6
1339 movaps
0xb0-0xc8(%rax),%xmm7
1340 movaps
0xc0-0xc8(%rax),%xmm8
1341 movaps
0xd0-0xc8(%rax),%xmm9
1342 movaps
0xe0-0xc8(%rax),%xmm10
1343 movaps
0xf0-0xc8(%rax),%xmm11
1344 movaps
0x100-0xc8(%rax),%xmm12
1345 movaps
0x110-0xc8(%rax),%xmm13
1346 movaps
0x120-0xc8(%rax),%xmm14
1347 movaps
0x130-0xc8(%rax),%xmm15
1351 movq
-48(%rax), %r15
1352 movq
-40(%rax), %r14
1353 movq
-32(%rax), %r13
1354 movq
-24(%rax), %r12
1355 movq
-16(%rax), %rbp
1358 .Lmul_gather4_epilogue
:
1360 .size rsaz_512_mul_gather4
,.-rsaz_512_mul_gather4
1364 my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1366 .globl rsaz_512_mul_scatter4
1367 .type rsaz_512_mul_scatter4
,\
@function,6
1369 rsaz_512_mul_scatter4
:
1379 .Lmul_scatter4_body
:
1380 leaq
($tbl,$pwr,8), $tbl
1381 movq
$out, %xmm0 # off-load arguments
1388 $code.=<<___
if ($addx);
1389 movl \
$0x80100,%r11d
1390 andl OPENSSL_ia32cap_P
+8(%rip),%r11d
1391 cmpl \
$0x80100,%r11d # check for MULX and ADO/CX
1395 movq
($out),%rbx # pass b[0]
1410 call __rsaz_512_reduce
1412 $code.=<<___
if ($addx);
1413 jmp
.Lmul_scatter_tail
1417 movq
($out), %rdx # pass b[0]
1418 call __rsaz_512_mulx
1423 movq
128(%rsp), %rdx # pull $n0
1433 call __rsaz_512_reducex
1443 adcq
104(%rsp), %r13
1444 adcq
112(%rsp), %r14
1445 adcq
120(%rsp), %r15
1449 call __rsaz_512_subtract
1451 movq
%r8, 128*0($inp) # scatter
1452 movq
%r9, 128*1($inp)
1453 movq
%r10, 128*2($inp)
1454 movq
%r11, 128*3($inp)
1455 movq
%r12, 128*4($inp)
1456 movq
%r13, 128*5($inp)
1457 movq
%r14, 128*6($inp)
1458 movq
%r15, 128*7($inp)
1460 leaq
128+24+48(%rsp), %rax
1461 movq
-48(%rax), %r15
1462 movq
-40(%rax), %r14
1463 movq
-32(%rax), %r13
1464 movq
-24(%rax), %r12
1465 movq
-16(%rax), %rbp
1468 .Lmul_scatter4_epilogue
:
1470 .size rsaz_512_mul_scatter4
,.-rsaz_512_mul_scatter4
1474 my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx");
1476 .globl rsaz_512_mul_by_one
1477 .type rsaz_512_mul_by_one
,\
@function,4
1479 rsaz_512_mul_by_one
:
1490 $code.=<<___
if ($addx);
1491 movl OPENSSL_ia32cap_P
+8(%rip),%eax
1494 movq
$mod, %rbp # reassign argument
1507 movdqa
%xmm0, (%rsp)
1508 movdqa
%xmm0, 16(%rsp)
1509 movdqa
%xmm0, 32(%rsp)
1510 movdqa
%xmm0, 48(%rsp)
1511 movdqa
%xmm0, 64(%rsp)
1512 movdqa
%xmm0, 80(%rsp)
1513 movdqa
%xmm0, 96(%rsp)
1515 $code.=<<___
if ($addx);
1517 cmpl \
$0x80100,%eax # check for MULX and ADO/CX
1521 call __rsaz_512_reduce
1523 $code.=<<___
if ($addx);
1527 movq
128(%rsp), %rdx # pull $n0
1528 call __rsaz_512_reducex
1541 leaq
128+24+48(%rsp), %rax
1542 movq
-48(%rax), %r15
1543 movq
-40(%rax), %r14
1544 movq
-32(%rax), %r13
1545 movq
-24(%rax), %r12
1546 movq
-16(%rax), %rbp
1549 .Lmul_by_one_epilogue
:
1551 .size rsaz_512_mul_by_one
,.-rsaz_512_mul_by_one
1554 { # __rsaz_512_reduce
1556 # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
1558 # clobbers: everything except %rbp and %rdi
1560 .type __rsaz_512_reduce
,\
@abi-omnipotent
1564 imulq
128+8(%rsp), %rbx
1567 jmp
.Lreduction_loop
1598 movq
128+8(%rsp), %rsi
1639 jne
.Lreduction_loop
1642 .size __rsaz_512_reduce
,.-__rsaz_512_reduce
1646 # __rsaz_512_reducex
1648 # input: %r8-%r15, %rbp - mod, %rdx - n0 (preloaded by caller), 128(%rsp) - n0
1650 # clobbers: everything except %rbp and %rdi
1652 .type __rsaz_512_reducex
,\
@abi-omnipotent
1655 #movq 128+8(%rsp), %rdx # pull $n0
1657 xorq
%rsi, %rsi # cf=0,of=0
1659 jmp
.Lreduction_loopx
1664 mulx
0(%rbp), %rax, %r8
1668 mulx
8(%rbp), %rax, %r9
1672 mulx
16(%rbp), %rbx, %r10
1676 mulx
24(%rbp), %rbx, %r11
1680 .byte
0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 32(%rbp), %rbx, %r12
1686 mulx
128+8(%rsp), %rbx, %rdx
1689 mulx
40(%rbp), %rax, %r13
1693 .byte
0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00 # mulx 48(%rbp), %rax, %r14
1697 mulx
56(%rbp), %rax, %r15
1700 adox
%rsi, %r15 # %rsi is 0
1701 adcx
%rsi, %r15 # cf=0
1704 jne
.Lreduction_loopx
1707 .size __rsaz_512_reducex
,.-__rsaz_512_reducex
1710 { # __rsaz_512_subtract
1711 # input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask
1713 # clobbers: everything but %rdi, %rsi and %rbp
1715 .type __rsaz_512_subtract
,\
@abi-omnipotent
1717 __rsaz_512_subtract
:
1771 .size __rsaz_512_subtract
,.-__rsaz_512_subtract
1776 # input: %rsi - ap, %rbp - bp
1778 # clobbers: everything
1779 my ($ap,$bp) = ("%rsi","%rbp");
1781 .type __rsaz_512_mul
,\
@abi-omnipotent
1922 .size __rsaz_512_mul
,.-__rsaz_512_mul
1928 # input: %rsi - ap, %rbp - bp
1930 # clobbers: everything
1931 my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi");
1933 .type __rsaz_512_mulx
,\
@abi-omnipotent
1936 mulx
($ap), %rbx, %r8 # initial %rdx preloaded by caller
1939 mulx
8($ap), %rax, %r9
1942 mulx
16($ap), %rbx, %r10
1945 mulx
24($ap), %rax, %r11
1948 mulx
32($ap), %rbx, %r12
1951 mulx
40($ap), %rax, %r13
1954 mulx
48($ap), %rbx, %r14
1957 mulx
56($ap), %rax, %r15
1963 xor $zero, $zero # cf=0,of=0
1969 mulx
($ap), %rax, %r8
1973 mulx
8($ap), %rax, %r9
1977 mulx
16($ap), %rax, %r10
1981 mulx
24($ap), %rax, %r11
1985 .byte
0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00 # mulx 32($ap), %rax, %r12
1989 mulx
40($ap), %rax, %r13
1993 mulx
48($ap), %rax, %r14
1997 mulx
56($ap), %rax, %r15
1998 movq
64($bp,%rcx,8), %rdx
1999 movq
%rbx, 8+64-8(%rsp,%rcx,8)
2002 adcx
$zero, %r15 # cf=0
2008 mulx
($ap), %rax, %r8
2012 .byte
0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00 # mulx 8($ap), %rax, %r9
2016 .byte
0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00 # mulx 16($ap), %rax, %r10
2020 mulx
24($ap), %rax, %r11
2024 mulx
32($ap), %rax, %r12
2028 mulx
40($ap), %rax, %r13
2032 .byte
0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
2036 .byte
0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($ap), %rax, %r15
2041 mov
%rbx, 8+64-8(%rsp)
2043 mov
%r9, 8+64+8(%rsp)
2044 mov
%r10, 8+64+16(%rsp)
2045 mov
%r11, 8+64+24(%rsp)
2046 mov
%r12, 8+64+32(%rsp)
2047 mov
%r13, 8+64+40(%rsp)
2048 mov
%r14, 8+64+48(%rsp)
2049 mov
%r15, 8+64+56(%rsp)
2052 .size __rsaz_512_mulx
,.-__rsaz_512_mulx
2056 my ($out,$inp,$power)= $win64 ?
("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
2058 .globl rsaz_512_scatter4
2059 .type rsaz_512_scatter4
,\
@abi-omnipotent
2062 leaq
($out,$power,8), $out
2070 leaq
128($out), $out
2074 .size rsaz_512_scatter4
,.-rsaz_512_scatter4
2076 .globl rsaz_512_gather4
2077 .type rsaz_512_gather4
,\
@abi-omnipotent
2081 $code.=<<___
if ($win64);
2082 .LSEH_begin_rsaz_512_gather4
:
2083 .byte
0x48,0x81,0xec,0xa8,0x00,0x00,0x00 # sub $0xa8,%rsp
2084 .byte
0x0f,0x29,0x34,0x24 # movaps %xmm6,(%rsp)
2085 .byte
0x0f,0x29,0x7c,0x24,0x10 # movaps %xmm7,0x10(%rsp)
2086 .byte
0x44,0x0f,0x29,0x44,0x24,0x20 # movaps %xmm8,0x20(%rsp)
2087 .byte
0x44,0x0f,0x29,0x4c,0x24,0x30 # movaps %xmm9,0x30(%rsp)
2088 .byte
0x44,0x0f,0x29,0x54,0x24,0x40 # movaps %xmm10,0x40(%rsp)
2089 .byte
0x44,0x0f,0x29,0x5c,0x24,0x50 # movaps %xmm11,0x50(%rsp)
2090 .byte
0x44,0x0f,0x29,0x64,0x24,0x60 # movaps %xmm12,0x60(%rsp)
2091 .byte
0x44,0x0f,0x29,0x6c,0x24,0x70 # movaps %xmm13,0x70(%rsp)
2092 .byte
0x44,0x0f,0x29,0xb4,0x24,0x80,0,0,0 # movaps %xmm14,0x80(%rsp)
2093 .byte
0x44,0x0f,0x29,0xbc,0x24,0x90,0,0,0 # movaps %xmm15,0x90(%rsp)
2097 movdqa
.Linc
+16(%rip),%xmm1 # 00000002000000020000000200000002
2098 movdqa
.Linc
(%rip),%xmm0 # 00000001000000010000000000000000
2100 pshufd \
$0,%xmm8,%xmm8 # broadcast $power
2104 ########################################################################
2105 # calculate mask by comparing 0..15 to $power
2107 for($i=0;$i<4;$i++) {
2109 paddd
%xmm`$i`,%xmm`$i+1`
2110 pcmpeqd
%xmm8,%xmm`$i`
2111 movdqa
%xmm7,%xmm`$i+3`
2116 paddd
%xmm`$i`,%xmm`$i+1`
2117 pcmpeqd
%xmm8,%xmm`$i`
2126 movdqa
16*0($inp),%xmm8
2127 movdqa
16*1($inp),%xmm9
2128 movdqa
16*2($inp),%xmm10
2129 movdqa
16*3($inp),%xmm11
2131 movdqa
16*4($inp),%xmm12
2133 movdqa
16*5($inp),%xmm13
2135 movdqa
16*6($inp),%xmm14
2137 movdqa
16*7($inp),%xmm15
2138 leaq
128($inp), $inp
2151 pshufd \
$0x4e,%xmm8,%xmm9
2158 $code.=<<___
if ($win64);
2159 movaps
0x00(%rsp),%xmm6
2160 movaps
0x10(%rsp),%xmm7
2161 movaps
0x20(%rsp),%xmm8
2162 movaps
0x30(%rsp),%xmm9
2163 movaps
0x40(%rsp),%xmm10
2164 movaps
0x50(%rsp),%xmm11
2165 movaps
0x60(%rsp),%xmm12
2166 movaps
0x70(%rsp),%xmm13
2167 movaps
0x80(%rsp),%xmm14
2168 movaps
0x90(%rsp),%xmm15
2173 .LSEH_end_rsaz_512_gather4
:
2174 .size rsaz_512_gather4
,.-rsaz_512_gather4
2183 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2184 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
2192 .extern __imp_RtlVirtualUnwind
2193 .type se_handler
,\
@abi-omnipotent
2207 mov
120($context),%rax # pull context->Rax
2208 mov
248($context),%rbx # pull context->Rip
2210 mov
8($disp),%rsi # disp->ImageBase
2211 mov
56($disp),%r11 # disp->HandlerData
2213 mov
0(%r11),%r10d # HandlerData[0]
2214 lea
(%rsi,%r10),%r10 # end of prologue label
2215 cmp %r10,%rbx # context->Rip<end of prologue label
2216 jb
.Lcommon_seh_tail
2218 mov
152($context),%rax # pull context->Rsp
2220 mov
4(%r11),%r10d # HandlerData[1]
2221 lea
(%rsi,%r10),%r10 # epilogue label
2222 cmp %r10,%rbx # context->Rip>=epilogue label
2223 jae
.Lcommon_seh_tail
2225 lea
128+24+48(%rax),%rax
2227 lea
.Lmul_gather4_epilogue
(%rip),%rbx
2229 jne
.Lse_not_in_mul_gather4
2233 lea
-48-0xa8(%rax),%rsi
2234 lea
512($context),%rdi
2236 .long
0xa548f3fc # cld; rep movsq
2238 .Lse_not_in_mul_gather4
:
2245 mov
%rbx,144($context) # restore context->Rbx
2246 mov
%rbp,160($context) # restore context->Rbp
2247 mov
%r12,216($context) # restore context->R12
2248 mov
%r13,224($context) # restore context->R13
2249 mov
%r14,232($context) # restore context->R14
2250 mov
%r15,240($context) # restore context->R15
2255 mov
%rax,152($context) # restore context->Rsp
2256 mov
%rsi,168($context) # restore context->Rsi
2257 mov
%rdi,176($context) # restore context->Rdi
2259 mov
40($disp),%rdi # disp->ContextRecord
2260 mov
$context,%rsi # context
2261 mov \
$154,%ecx # sizeof(CONTEXT)
2262 .long
0xa548f3fc # cld; rep movsq
2265 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
2266 mov
8(%rsi),%rdx # arg2, disp->ImageBase
2267 mov
0(%rsi),%r8 # arg3, disp->ControlPc
2268 mov
16(%rsi),%r9 # arg4, disp->FunctionEntry
2269 mov
40(%rsi),%r10 # disp->ContextRecord
2270 lea
56(%rsi),%r11 # &disp->HandlerData
2271 lea
24(%rsi),%r12 # &disp->EstablisherFrame
2272 mov
%r10,32(%rsp) # arg5
2273 mov
%r11,40(%rsp) # arg6
2274 mov
%r12,48(%rsp) # arg7
2275 mov
%rcx,56(%rsp) # arg8, (NULL)
2276 call
*__imp_RtlVirtualUnwind
(%rip)
2278 mov \
$1,%eax # ExceptionContinueSearch
2290 .size se_handler
,.-se_handler
2294 .rva
.LSEH_begin_rsaz_512_sqr
2295 .rva
.LSEH_end_rsaz_512_sqr
2296 .rva
.LSEH_info_rsaz_512_sqr
2298 .rva
.LSEH_begin_rsaz_512_mul
2299 .rva
.LSEH_end_rsaz_512_mul
2300 .rva
.LSEH_info_rsaz_512_mul
2302 .rva
.LSEH_begin_rsaz_512_mul_gather4
2303 .rva
.LSEH_end_rsaz_512_mul_gather4
2304 .rva
.LSEH_info_rsaz_512_mul_gather4
2306 .rva
.LSEH_begin_rsaz_512_mul_scatter4
2307 .rva
.LSEH_end_rsaz_512_mul_scatter4
2308 .rva
.LSEH_info_rsaz_512_mul_scatter4
2310 .rva
.LSEH_begin_rsaz_512_mul_by_one
2311 .rva
.LSEH_end_rsaz_512_mul_by_one
2312 .rva
.LSEH_info_rsaz_512_mul_by_one
2314 .rva
.LSEH_begin_rsaz_512_gather4
2315 .rva
.LSEH_end_rsaz_512_gather4
2316 .rva
.LSEH_info_rsaz_512_gather4
2320 .LSEH_info_rsaz_512_sqr
:
2323 .rva
.Lsqr_body
,.Lsqr_epilogue
# HandlerData[]
2324 .LSEH_info_rsaz_512_mul
:
2327 .rva
.Lmul_body
,.Lmul_epilogue
# HandlerData[]
2328 .LSEH_info_rsaz_512_mul_gather4
:
2331 .rva
.Lmul_gather4_body
,.Lmul_gather4_epilogue
# HandlerData[]
2332 .LSEH_info_rsaz_512_mul_scatter4
:
2335 .rva
.Lmul_scatter4_body
,.Lmul_scatter4_epilogue
# HandlerData[]
2336 .LSEH_info_rsaz_512_mul_by_one
:
2339 .rva
.Lmul_by_one_body
,.Lmul_by_one_epilogue
# HandlerData[]
2340 .LSEH_info_rsaz_512_gather4
:
2341 .byte
0x01,0x46,0x16,0x00
2342 .byte
0x46,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15
2343 .byte
0x3d,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14
2344 .byte
0x34,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13
2345 .byte
0x2e,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12
2346 .byte
0x28,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11
2347 .byte
0x22,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10
2348 .byte
0x1c,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9
2349 .byte
0x16,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8
2350 .byte
0x10,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7
2351 .byte
0x0b,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6
2352 .byte
0x07,0x01,0x15,0x00 # sub rsp,0xa8
2356 $code =~ s/\`([^\`]*)\`/eval $1/gem;