2 # Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
3 # Copyright (c) 2012, Intel Corporation. All Rights Reserved.
5 # Licensed under the OpenSSL license (the "License"). You may not use
6 # this file except in compliance with the License. You can obtain a copy
7 # in the file LICENSE in the source distribution or at
8 # https://www.openssl.org/source/license.html
10 # Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
11 # (1) Intel Corporation, Israel Development Center, Haifa, Israel
12 # (2) University of Haifa, Israel
15 # [1] S. Gueron, "Efficient Software Implementations of Modular
16 # Exponentiation", http://eprint.iacr.org/2011/239
17 # [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring".
18 # IEEE Proceedings of 9th International Conference on Information
19 # Technology: New Generations (ITNG 2012), 821-823 (2012).
20 # [3] S. Gueron, Efficient Software Implementations of Modular Exponentiation
21 # Journal of Cryptographic Engineering 2:31-43 (2012).
22 # [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis
23 # resistant 512-bit and 1024-bit modular exponentiation for optimizing
24 # RSA1024 and RSA2048 on x86_64 platforms",
25 # http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest
27 # While original submission covers 512- and 1024-bit exponentiation,
28 # this module is limited to 512-bit version only (and as such
29 # accelerates RSA1024 sign). This is because improvement for longer
30 # keys is not high enough to justify the effort, highest measured
31 # was ~5% on Westmere. [This is relative to OpenSSL 1.0.2, upcoming
32 # for the moment of this writing!] Nor does this module implement
33 # "monolithic" complete exponentiation jumbo-subroutine, but adheres
34 # to more modular mixture of C and assembly. And it's optimized even
35 # for processors other than Intel Core family (see table below for
36 # improvement coefficients).
39 # RSA1024 sign/sec this/original |this/rsax(*) this/fips(*)
40 # ----------------+---------------------------
41 # Opteron +13% |+5% +20%
42 # Bulldozer -0% |-1% +10%
44 # Westmere +5% |+14% +17%
45 # Sandy Bridge +2% |+12% +29%
46 # Ivy Bridge +1% |+11% +35%
47 # Haswell(**) -0% |+12% +39%
49 # VIA Nano +70% |+9% +25%
51 # (*) rsax engine and fips numbers are presented for reference
53 # (**) MULX was attempted, but found to give only marginal improvement;
# Perlasm preamble: resolve the output flavour, locate the x86_64
# translator script and pipe all generated code through it.

# A flavour argument containing a dot is really the output file name
# (no explicit flavour was given on the command line).
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Windows targets are recognized either by flavour (nasm/masm/mingw64)
# or by an .asm output extension.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Directory this script lives in; x86_64-xlate.pl is expected either
# alongside it or two levels up in perlasm/.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate ) or
die "can't locate x86_64-xlate.pl";

# Pipe the generated code through the translator.  The original open
# was unchecked; fail loudly if the pipe cannot be set up.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
# Probe the toolchain for ADCX/ADOX+MULX (ADX) support; $addx gates the
# faster code paths emitted below.  NOTE(review): the assignments that
# actually set $addx from the captured version numbers are outside this
# view — confirm against the full file.
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
# Fallback probe for Windows builds assembled with nasm.
if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM
} =~ /nasm/) &&
`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
# Fallback probe for Windows builds assembled with MS ml64.
if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM
} =~ /ml64/) &&
`ml64 2>&1` =~ /Version ([0-9]+)\./) {
# Fallback probe for clang/LLVM-based compilers (integrated assembler).
if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) {
# Normalize "major.minor" into a single comparable number.
my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
# Register aliases.  The first set is the internal convention shared
# with the helper routines ($mod lives in %rbp there)...
($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp"); # common internal API
# ...while rsaz_512_sqr receives its five arguments in the System V
# AMD64 argument registers (%rdi,%rsi,%rdx,%rcx,%r8d).
my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d");
96 .extern OPENSSL_ia32cap_P
99 .type rsaz_512_sqr
,\
@function,5
101 rsaz_512_sqr
: # 25-29% faster than rsaz_512_mul
117 .cfi_adjust_cfa_offset
128+24
119 movq
$mod, %xmm1 # common off-load
124 $code.=<<___
if ($addx);
126 andl OPENSSL_ia32cap_P
+8(%rip),%r11d
127 cmpl \
$0x80100,%r11d # check for MULX and ADO/CX
135 movl
$times,128+8(%rsp)
137 movq
%rdx, %rbx # 0($inp)
138 mov
%rax, %rbp # 8($inp)
179 xorq
%rcx,%rcx # rcx:r8 = r8 << 1
238 xorq
%rbx, %rbx # rbx:r10:r9 = r10:r9 << 1
245 # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
294 xorq
%rcx, %rcx # rcx:r12:r11 = r12:r11 << 1
301 # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
313 mov
%rax, %r11 # 32($inp)
320 mov
%rax, %r12 # 40($inp)
329 mov
%rax, %rbp # 48($inp)
345 xorq
%rbx, %rbx # rbx:r13:r14 = r13:r14 << 1
352 # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
355 movq
%r12, %rax # 40($inp)
365 movq
%rbp, %rax # 48($inp)
377 mov
%rax, %r14 # 56($inp)
385 xorq
%rcx, %rcx # rcx:r8:r15 = r8:r15 << 1
392 # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
395 movq
%rbp, %rax # 48($inp)
405 movq
%r14, %rax # 56($inp)
416 xorq
%rbx, %rbx # rbx:r10:r9 = r10:r9 << 1
423 # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
426 movq
%r14, %rax # 56($inp)
439 xorq
%rcx, %rcx # rcx:r12:r11 = r12:r11 << 1
446 # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
449 movq
%r14, %rax # 56($inp)
457 xorq
%rbx, %rbx # rbx:r13 = r13 << 1
462 # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
480 call __rsaz_512_reduce
492 call __rsaz_512_subtract
496 movl
128+8(%rsp), $times
508 movl
$times,128+8(%rsp)
509 movq
$out, %xmm0 # off-load
514 mulx
16($inp), %rcx, %r10
515 xor %rbp, %rbp # cf=0, of=0
517 mulx
24($inp), %rax, %r11
520 .byte
0xc4,0x62,0xf3,0xf6,0xa6,0x20,0x00,0x00,0x00 # mulx 32($inp), %rcx, %r12
523 .byte
0xc4,0x62,0xfb,0xf6,0xae,0x28,0x00,0x00,0x00 # mulx 40($inp), %rax, %r13
526 mulx
48($inp), %rcx, %r14
530 mulx
56($inp), %rax, %r15
532 adcx
%rbp, %r15 # %rbp is 0
534 mulx
%rdx, %rax, $out
535 mov
%rbx, %rdx # 8($inp)
546 .byte
0xc4,0xe2,0xfb,0xf6,0x9e,0x10,0x00,0x00,0x00 # mulx 16($inp), %rax, %rbx
550 mulx
24($inp), $out, %r8
555 mulx
32($inp), %rax, %rbx
559 mulx
40($inp), $out, %r8
563 .byte
0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
567 .byte
0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r8
570 mulx
%rdx, %rax, $out
572 .byte
0x48,0x8b,0x96,0x10,0x00,0x00,0x00 # mov 16($inp), %rdx
576 # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
585 .byte
0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 # mov %r10, 24(%rsp)
588 mulx
24($inp), $out, %r9
592 mulx
32($inp), %rax, %rcx
596 .byte
0xc4,0x62,0xc3,0xf6,0x8e,0x28,0x00,0x00,0x00 # mulx 40($inp), $out, %r9
600 .byte
0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rcx
604 mulx
56($inp), $out, %r9
607 mulx
%rdx, %rax, $out
613 # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
625 mulx
32($inp), %rax, %rbx
629 mulx
40($inp), $out, %r10
633 mulx
48($inp), %rax, %rbx
637 mulx
56($inp), $out, %r10
640 mulx
%rdx, %rax, $out
646 # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
658 mulx
40($inp), $out, %r11
662 mulx
48($inp), %rax, %rcx
666 mulx
56($inp), $out, %r11
669 mulx
%rdx, %rax, $out
675 # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
687 .byte
0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
691 .byte
0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r12
694 mulx
%rdx, %rax, $out
700 # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
712 .byte
0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r13
716 mulx
%rdx, %rax, $out
720 # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
728 .byte
0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 # mov %r11, 96(%rsp)
729 .byte
0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 # mov %r12, 104(%rsp)
732 mulx
%rdx, %rax, %rdx
735 # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
744 movq
128(%rsp), %rdx # pull $n0
757 call __rsaz_512_reducex
769 call __rsaz_512_subtract
773 movl
128+8(%rsp), $times
784 leaq
128+24+48(%rsp), %rax
799 .cfi_def_cfa_register
%rsp
803 .size rsaz_512_sqr
,.-rsaz_512_sqr
# rsaz_512_mul: map the five arguments (out, a, b, mod, n0) to the
# System V AMD64 argument registers.
my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
810 .type rsaz_512_mul
,\
@function,5
828 .cfi_adjust_cfa_offset
128+24
830 movq
$out, %xmm0 # off-load arguments
834 $code.=<<___
if ($addx);
836 andl OPENSSL_ia32cap_P
+8(%rip),%r11d
837 cmpl \
$0x80100,%r11d # check for MULX and ADO/CX
841 movq
($bp), %rbx # pass b[0]
842 movq
$bp, %rbp # pass argument
857 call __rsaz_512_reduce
859 $code.=<<___
if ($addx);
864 movq
$bp, %rbp # pass argument
865 movq
($bp), %rdx # pass b[0]
871 movq
128(%rsp), %rdx # pull $n0
881 call __rsaz_512_reducex
895 call __rsaz_512_subtract
897 leaq
128+24+48(%rsp), %rax
912 .cfi_def_cfa_register
%rsp
916 .size rsaz_512_mul
,.-rsaz_512_mul
# rsaz_512_mul_gather4: six arguments; $pwr (the gather index) arrives
# as a 32-bit value in %r9d.
my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
922 .globl rsaz_512_mul_gather4
923 .type rsaz_512_mul_gather4
,\
@function,6
925 rsaz_512_mul_gather4
:
940 subq \
$`128+24+($win64?0xb0:0)`, %rsp
941 .cfi_adjust_cfa_offset
`128+24+($win64?0xb0:0)`
943 $code.=<<___
if ($win64);
944 movaps
%xmm6,0xa0(%rsp)
945 movaps
%xmm7,0xb0(%rsp)
946 movaps
%xmm8,0xc0(%rsp)
947 movaps
%xmm9,0xd0(%rsp)
948 movaps
%xmm10,0xe0(%rsp)
949 movaps
%xmm11,0xf0(%rsp)
950 movaps
%xmm12,0x100(%rsp)
951 movaps
%xmm13,0x110(%rsp)
952 movaps
%xmm14,0x120(%rsp)
953 movaps
%xmm15,0x130(%rsp)
958 movdqa
.Linc
+16(%rip),%xmm1 # 00000002000000020000000200000002
959 movdqa
.Linc
(%rip),%xmm0 # 00000001000000010000000000000000
961 pshufd \
$0,%xmm8,%xmm8 # broadcast $power
965 ########################################################################
966 # calculate mask by comparing 0..15 to $power
968 for($i=0;$i<4;$i++) {
970 paddd
%xmm`$i`,%xmm`$i+1`
971 pcmpeqd
%xmm8,%xmm`$i`
972 movdqa
%xmm7,%xmm`$i+3`
977 paddd
%xmm`$i`,%xmm`$i+1`
978 pcmpeqd
%xmm8,%xmm`$i`
984 movdqa
16*0($bp),%xmm8
985 movdqa
16*1($bp),%xmm9
986 movdqa
16*2($bp),%xmm10
987 movdqa
16*3($bp),%xmm11
989 movdqa
16*4($bp),%xmm12
991 movdqa
16*5($bp),%xmm13
993 movdqa
16*6($bp),%xmm14
995 movdqa
16*7($bp),%xmm15
1009 pshufd \
$0x4e,%xmm8,%xmm9
1012 $code.=<<___
if ($addx);
1013 movl \
$0x80100,%r11d
1014 andl OPENSSL_ia32cap_P
+8(%rip),%r11d
1015 cmpl \
$0x80100,%r11d # check for MULX and ADO/CX
1021 movq
$n0, 128(%rsp) # off-load arguments
1022 movq
$out, 128+8(%rsp)
1023 movq
$mod, 128+16(%rsp)
1027 mulq
%rbx # 0 iteration
1076 jmp
.Loop_mul_gather
1080 movdqa
16*0(%rbp),%xmm8
1081 movdqa
16*1(%rbp),%xmm9
1082 movdqa
16*2(%rbp),%xmm10
1083 movdqa
16*3(%rbp),%xmm11
1085 movdqa
16*4(%rbp),%xmm12
1087 movdqa
16*5(%rbp),%xmm13
1089 movdqa
16*6(%rbp),%xmm14
1091 movdqa
16*7(%rbp),%xmm15
1092 leaq
128(%rbp), %rbp
1105 pshufd \
$0x4e,%xmm8,%xmm9
1175 jnz
.Loop_mul_gather
1186 movq
128+8(%rsp), $out
1187 movq
128+16(%rsp), %rbp
1198 call __rsaz_512_reduce
1200 $code.=<<___
if ($addx);
1201 jmp
.Lmul_gather_tail
1207 mov
$n0, 128(%rsp) # off-load arguments
1208 mov
$out, 128+8(%rsp)
1209 mov
$mod, 128+16(%rsp)
1211 mulx
($ap), %rbx, %r8 # 0 iteration
1213 xor %edi, %edi # cf=0, of=0
1215 mulx
8($ap), %rax, %r9
1217 mulx
16($ap), %rbx, %r10
1220 mulx
24($ap), %rax, %r11
1223 mulx
32($ap), %rbx, %r12
1226 mulx
40($ap), %rax, %r13
1229 mulx
48($ap), %rbx, %r14
1232 mulx
56($ap), %rax, %r15
1237 adcx
%rdi, %r15 # %rdi is 0
1240 jmp
.Loop_mulx_gather
1244 movdqa
16*0(%rbp),%xmm8
1245 movdqa
16*1(%rbp),%xmm9
1246 movdqa
16*2(%rbp),%xmm10
1247 movdqa
16*3(%rbp),%xmm11
1249 movdqa
16*4(%rbp),%xmm12
1251 movdqa
16*5(%rbp),%xmm13
1253 movdqa
16*6(%rbp),%xmm14
1255 movdqa
16*7(%rbp),%xmm15
1256 leaq
128(%rbp), %rbp
1269 pshufd \
$0x4e,%xmm8,%xmm9
1273 .byte
0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00 # mulx ($ap), %rax, %r8
1277 mulx
8($ap), %rax, %r9
1281 mulx
16($ap), %rax, %r10
1285 .byte
0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 # mulx 24($ap), %rax, %r11
1289 mulx
32($ap), %rax, %r12
1293 mulx
40($ap), %rax, %r13
1297 .byte
0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
1302 mulx
56($ap), %rax, %r15
1303 mov
%rbx, 64(%rsp,%rcx,8)
1307 adcx
%rdi, %r15 # cf=0
1310 jnz
.Loop_mulx_gather
1314 mov
%r10, 64+16(%rsp)
1315 mov
%r11, 64+24(%rsp)
1316 mov
%r12, 64+32(%rsp)
1317 mov
%r13, 64+40(%rsp)
1318 mov
%r14, 64+48(%rsp)
1319 mov
%r15, 64+56(%rsp)
1321 mov
128(%rsp), %rdx # pull arguments
1322 mov
128+8(%rsp), $out
1323 mov
128+16(%rsp), %rbp
1334 call __rsaz_512_reducex
1344 adcq
104(%rsp), %r13
1345 adcq
112(%rsp), %r14
1346 adcq
120(%rsp), %r15
1349 call __rsaz_512_subtract
1351 leaq
128+24+48(%rsp), %rax
1353 $code.=<<___
if ($win64);
1354 movaps
0xa0-0xc8(%rax),%xmm6
1355 movaps
0xb0-0xc8(%rax),%xmm7
1356 movaps
0xc0-0xc8(%rax),%xmm8
1357 movaps
0xd0-0xc8(%rax),%xmm9
1358 movaps
0xe0-0xc8(%rax),%xmm10
1359 movaps
0xf0-0xc8(%rax),%xmm11
1360 movaps
0x100-0xc8(%rax),%xmm12
1361 movaps
0x110-0xc8(%rax),%xmm13
1362 movaps
0x120-0xc8(%rax),%xmm14
1363 movaps
0x130-0xc8(%rax),%xmm15
1368 movq
-48(%rax), %r15
1370 movq
-40(%rax), %r14
1372 movq
-32(%rax), %r13
1374 movq
-24(%rax), %r12
1376 movq
-16(%rax), %rbp
1381 .cfi_def_cfa_register
%rsp
1382 .Lmul_gather4_epilogue
:
1385 .size rsaz_512_mul_gather4
,.-rsaz_512_mul_gather4
# rsaz_512_mul_scatter4: six arguments; $tbl is the scatter table base
# and $pwr the 32-bit scatter index in %r9d.
my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1391 .globl rsaz_512_mul_scatter4
1392 .type rsaz_512_mul_scatter4
,\
@function,6
1394 rsaz_512_mul_scatter4
:
1411 .cfi_adjust_cfa_offset
128+24
1412 .Lmul_scatter4_body
:
1413 leaq
($tbl,$pwr,8), $tbl
1414 movq
$out, %xmm0 # off-load arguments
1421 $code.=<<___
if ($addx);
1422 movl \
$0x80100,%r11d
1423 andl OPENSSL_ia32cap_P
+8(%rip),%r11d
1424 cmpl \
$0x80100,%r11d # check for MULX and ADO/CX
1428 movq
($out),%rbx # pass b[0]
1443 call __rsaz_512_reduce
1445 $code.=<<___
if ($addx);
1446 jmp
.Lmul_scatter_tail
1450 movq
($out), %rdx # pass b[0]
1451 call __rsaz_512_mulx
1456 movq
128(%rsp), %rdx # pull $n0
1466 call __rsaz_512_reducex
1476 adcq
104(%rsp), %r13
1477 adcq
112(%rsp), %r14
1478 adcq
120(%rsp), %r15
1482 call __rsaz_512_subtract
1484 movq
%r8, 128*0($inp) # scatter
1485 movq
%r9, 128*1($inp)
1486 movq
%r10, 128*2($inp)
1487 movq
%r11, 128*3($inp)
1488 movq
%r12, 128*4($inp)
1489 movq
%r13, 128*5($inp)
1490 movq
%r14, 128*6($inp)
1491 movq
%r15, 128*7($inp)
1493 leaq
128+24+48(%rsp), %rax
1495 movq
-48(%rax), %r15
1497 movq
-40(%rax), %r14
1499 movq
-32(%rax), %r13
1501 movq
-24(%rax), %r12
1503 movq
-16(%rax), %rbp
1508 .cfi_def_cfa_register
%rsp
1509 .Lmul_scatter4_epilogue
:
1512 .size rsaz_512_mul_scatter4
,.-rsaz_512_mul_scatter4
# rsaz_512_mul_by_one: four arguments in the System V AMD64 argument
# registers.
my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx");
1518 .globl rsaz_512_mul_by_one
1519 .type rsaz_512_mul_by_one
,\
@function,4
1521 rsaz_512_mul_by_one
:
1537 .cfi_adjust_cfa_offset
128+24
1540 $code.=<<___
if ($addx);
1541 movl OPENSSL_ia32cap_P
+8(%rip),%eax
1544 movq
$mod, %rbp # reassign argument
1557 movdqa
%xmm0, (%rsp)
1558 movdqa
%xmm0, 16(%rsp)
1559 movdqa
%xmm0, 32(%rsp)
1560 movdqa
%xmm0, 48(%rsp)
1561 movdqa
%xmm0, 64(%rsp)
1562 movdqa
%xmm0, 80(%rsp)
1563 movdqa
%xmm0, 96(%rsp)
1565 $code.=<<___
if ($addx);
1567 cmpl \
$0x80100,%eax # check for MULX and ADO/CX
1571 call __rsaz_512_reduce
1573 $code.=<<___
if ($addx);
1577 movq
128(%rsp), %rdx # pull $n0
1578 call __rsaz_512_reducex
1591 leaq
128+24+48(%rsp), %rax
1593 movq
-48(%rax), %r15
1595 movq
-40(%rax), %r14
1597 movq
-32(%rax), %r13
1599 movq
-24(%rax), %r12
1601 movq
-16(%rax), %rbp
1606 .cfi_def_cfa_register
%rsp
1607 .Lmul_by_one_epilogue
:
1610 .size rsaz_512_mul_by_one
,.-rsaz_512_mul_by_one
1613 { # __rsaz_512_reduce
1615 # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
1617 # clobbers: everything except %rbp and %rdi
1619 .type __rsaz_512_reduce
,\
@abi-omnipotent
1624 imulq
128+8(%rsp), %rbx
1627 jmp
.Lreduction_loop
1658 movq
128+8(%rsp), %rsi
1699 jne
.Lreduction_loop
1703 .size __rsaz_512_reduce
,.-__rsaz_512_reduce
1707 # __rsaz_512_reducex
1709 # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
1711 # clobbers: everything except %rbp and %rdi
1713 .type __rsaz_512_reducex
,\
@abi-omnipotent
1717 #movq 128+8(%rsp), %rdx # pull $n0
1719 xorq
%rsi, %rsi # cf=0,of=0
1721 jmp
.Lreduction_loopx
1726 mulx
0(%rbp), %rax, %r8
1730 mulx
8(%rbp), %rax, %r9
1734 mulx
16(%rbp), %rbx, %r10
1738 mulx
24(%rbp), %rbx, %r11
1742 .byte
0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 32(%rbp), %rbx, %r12
1748 mulx
128+8(%rsp), %rbx, %rdx
1751 mulx
40(%rbp), %rax, %r13
1755 .byte
0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00 # mulx 48(%rbp), %rax, %r14
1759 mulx
56(%rbp), %rax, %r15
1762 adox
%rsi, %r15 # %rsi is 0
1763 adcx
%rsi, %r15 # cf=0
1766 jne
.Lreduction_loopx
1770 .size __rsaz_512_reducex
,.-__rsaz_512_reducex
1773 { # __rsaz_512_subtract
1774 # input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask
1776 # clobbers: everything but %rdi, %rsi and %rbp
1778 .type __rsaz_512_subtract
,\
@abi-omnipotent
1780 __rsaz_512_subtract
:
1836 .size __rsaz_512_subtract
,.-__rsaz_512_subtract
1841 # input: %rsi - ap, %rbp - bp
1843 # clobbers: everything
# __rsaz_512_mul internal convention: a-pointer in %rsi, b-pointer in
# %rbp (per the "input:" note above).
my ($ap,$bp) = ("%rsi","%rbp");
1846 .type __rsaz_512_mul
,\
@abi-omnipotent
1989 .size __rsaz_512_mul
,.-__rsaz_512_mul
1995 # input: %rsi - ap, %rbp - bp
1997 # clobbers: everything
# __rsaz_512_mulx internal convention: as above, plus %rdi kept as a
# zero register for the adcx/adox carry-chain resets.
my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi");
2000 .type __rsaz_512_mulx
,\
@abi-omnipotent
2004 mulx
($ap), %rbx, %r8 # initial %rdx preloaded by caller
2007 mulx
8($ap), %rax, %r9
2010 mulx
16($ap), %rbx, %r10
2013 mulx
24($ap), %rax, %r11
2016 mulx
32($ap), %rbx, %r12
2019 mulx
40($ap), %rax, %r13
2022 mulx
48($ap), %rbx, %r14
2025 mulx
56($ap), %rax, %r15
2031 xor $zero, $zero # cf=0,of=0
2037 mulx
($ap), %rax, %r8
2041 mulx
8($ap), %rax, %r9
2045 mulx
16($ap), %rax, %r10
2049 mulx
24($ap), %rax, %r11
2053 .byte
0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00 # mulx 32($ap), %rax, %r12
2057 mulx
40($ap), %rax, %r13
2061 mulx
48($ap), %rax, %r14
2065 mulx
56($ap), %rax, %r15
2066 movq
64($bp,%rcx,8), %rdx
2067 movq
%rbx, 8+64-8(%rsp,%rcx,8)
2070 adcx
$zero, %r15 # cf=0
2076 mulx
($ap), %rax, %r8
2080 .byte
0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00 # mulx 8($ap), %rax, %r9
2084 .byte
0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00 # mulx 16($ap), %rax, %r10
2088 mulx
24($ap), %rax, %r11
2092 mulx
32($ap), %rax, %r12
2096 mulx
40($ap), %rax, %r13
2100 .byte
0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
2104 .byte
0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($ap), %rax, %r15
2109 mov
%rbx, 8+64-8(%rsp)
2111 mov
%r9, 8+64+8(%rsp)
2112 mov
%r10, 8+64+16(%rsp)
2113 mov
%r11, 8+64+24(%rsp)
2114 mov
%r12, 8+64+32(%rsp)
2115 mov
%r13, 8+64+40(%rsp)
2116 mov
%r14, 8+64+48(%rsp)
2117 mov
%r15, 8+64+56(%rsp)
2121 .size __rsaz_512_mulx
,.-__rsaz_512_mulx
# rsaz_512_scatter4/gather4: argument registers differ by ABI — Win64
# uses %rcx/%rdx/%r8d, System V uses %rdi/%rsi/%edx.
my ($out,$inp,$power)= $win64 ?
("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
2127 .globl rsaz_512_scatter4
2128 .type rsaz_512_scatter4
,\
@abi-omnipotent
2132 leaq
($out,$power,8), $out
2140 leaq
128($out), $out
2145 .size rsaz_512_scatter4
,.-rsaz_512_scatter4
2147 .globl rsaz_512_gather4
2148 .type rsaz_512_gather4
,\
@abi-omnipotent
2153 $code.=<<___
if ($win64);
2154 .LSEH_begin_rsaz_512_gather4
:
2155 .byte
0x48,0x81,0xec,0xa8,0x00,0x00,0x00 # sub $0xa8,%rsp
2156 .byte
0x0f,0x29,0x34,0x24 # movaps %xmm6,(%rsp)
2157 .byte
0x0f,0x29,0x7c,0x24,0x10 # movaps %xmm7,0x10(%rsp)
2158 .byte
0x44,0x0f,0x29,0x44,0x24,0x20 # movaps %xmm8,0x20(%rsp)
2159 .byte
0x44,0x0f,0x29,0x4c,0x24,0x30 # movaps %xmm9,0x30(%rsp)
2160 .byte
0x44,0x0f,0x29,0x54,0x24,0x40 # movaps %xmm10,0x40(%rsp)
2161 .byte
0x44,0x0f,0x29,0x5c,0x24,0x50 # movaps %xmm11,0x50(%rsp)
2162 .byte
0x44,0x0f,0x29,0x64,0x24,0x60 # movaps %xmm12,0x60(%rsp)
2163 .byte
0x44,0x0f,0x29,0x6c,0x24,0x70 # movaps %xmm13,0x70(%rsp)
2164 .byte
0x44,0x0f,0x29,0xb4,0x24,0x80,0,0,0 # movaps %xmm14,0x80(%rsp)
2165 .byte
0x44,0x0f,0x29,0xbc,0x24,0x90,0,0,0 # movaps %xmm15,0x90(%rsp)
2169 movdqa
.Linc
+16(%rip),%xmm1 # 00000002000000020000000200000002
2170 movdqa
.Linc
(%rip),%xmm0 # 00000001000000010000000000000000
2172 pshufd \
$0,%xmm8,%xmm8 # broadcast $power
2176 ########################################################################
2177 # calculate mask by comparing 0..15 to $power
2179 for($i=0;$i<4;$i++) {
2181 paddd
%xmm`$i`,%xmm`$i+1`
2182 pcmpeqd
%xmm8,%xmm`$i`
2183 movdqa
%xmm7,%xmm`$i+3`
2188 paddd
%xmm`$i`,%xmm`$i+1`
2189 pcmpeqd
%xmm8,%xmm`$i`
2198 movdqa
16*0($inp),%xmm8
2199 movdqa
16*1($inp),%xmm9
2200 movdqa
16*2($inp),%xmm10
2201 movdqa
16*3($inp),%xmm11
2203 movdqa
16*4($inp),%xmm12
2205 movdqa
16*5($inp),%xmm13
2207 movdqa
16*6($inp),%xmm14
2209 movdqa
16*7($inp),%xmm15
2210 leaq
128($inp), $inp
2223 pshufd \
$0x4e,%xmm8,%xmm9
2230 $code.=<<___
if ($win64);
2231 movaps
0x00(%rsp),%xmm6
2232 movaps
0x10(%rsp),%xmm7
2233 movaps
0x20(%rsp),%xmm8
2234 movaps
0x30(%rsp),%xmm9
2235 movaps
0x40(%rsp),%xmm10
2236 movaps
0x50(%rsp),%xmm11
2237 movaps
0x60(%rsp),%xmm12
2238 movaps
0x70(%rsp),%xmm13
2239 movaps
0x80(%rsp),%xmm14
2240 movaps
0x90(%rsp),%xmm15
2245 .LSEH_end_rsaz_512_gather4
:
2247 .size rsaz_512_gather4
,.-rsaz_512_gather4
2256 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2257 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
2265 .extern __imp_RtlVirtualUnwind
2266 .type se_handler
,\
@abi-omnipotent
2280 mov
120($context),%rax # pull context->Rax
2281 mov
248($context),%rbx # pull context->Rip
2283 mov
8($disp),%rsi # disp->ImageBase
2284 mov
56($disp),%r11 # disp->HandlerData
2286 mov
0(%r11),%r10d # HandlerData[0]
2287 lea
(%rsi,%r10),%r10 # end of prologue label
2288 cmp %r10,%rbx # context->Rip<end of prologue label
2289 jb
.Lcommon_seh_tail
2291 mov
152($context),%rax # pull context->Rsp
2293 mov
4(%r11),%r10d # HandlerData[1]
2294 lea
(%rsi,%r10),%r10 # epilogue label
2295 cmp %r10,%rbx # context->Rip>=epilogue label
2296 jae
.Lcommon_seh_tail
2298 lea
128+24+48(%rax),%rax
2300 lea
.Lmul_gather4_epilogue
(%rip),%rbx
2302 jne
.Lse_not_in_mul_gather4
2306 lea
-48-0xa8(%rax),%rsi
2307 lea
512($context),%rdi
2309 .long
0xa548f3fc # cld; rep movsq
2311 .Lse_not_in_mul_gather4
:
2318 mov
%rbx,144($context) # restore context->Rbx
2319 mov
%rbp,160($context) # restore context->Rbp
2320 mov
%r12,216($context) # restore context->R12
2321 mov
%r13,224($context) # restore context->R13
2322 mov
%r14,232($context) # restore context->R14
2323 mov
%r15,240($context) # restore context->R15
2328 mov
%rax,152($context) # restore context->Rsp
2329 mov
%rsi,168($context) # restore context->Rsi
2330 mov
%rdi,176($context) # restore context->Rdi
2332 mov
40($disp),%rdi # disp->ContextRecord
2333 mov
$context,%rsi # context
2334 mov \
$154,%ecx # sizeof(CONTEXT)
2335 .long
0xa548f3fc # cld; rep movsq
2338 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
2339 mov
8(%rsi),%rdx # arg2, disp->ImageBase
2340 mov
0(%rsi),%r8 # arg3, disp->ControlPc
2341 mov
16(%rsi),%r9 # arg4, disp->FunctionEntry
2342 mov
40(%rsi),%r10 # disp->ContextRecord
2343 lea
56(%rsi),%r11 # &disp->HandlerData
2344 lea
24(%rsi),%r12 # &disp->EstablisherFrame
2345 mov
%r10,32(%rsp) # arg5
2346 mov
%r11,40(%rsp) # arg6
2347 mov
%r12,48(%rsp) # arg7
2348 mov
%rcx,56(%rsp) # arg8, (NULL)
2349 call
*__imp_RtlVirtualUnwind
(%rip)
2351 mov \
$1,%eax # ExceptionContinueSearch
2363 .size se_handler
,.-se_handler
2367 .rva
.LSEH_begin_rsaz_512_sqr
2368 .rva
.LSEH_end_rsaz_512_sqr
2369 .rva
.LSEH_info_rsaz_512_sqr
2371 .rva
.LSEH_begin_rsaz_512_mul
2372 .rva
.LSEH_end_rsaz_512_mul
2373 .rva
.LSEH_info_rsaz_512_mul
2375 .rva
.LSEH_begin_rsaz_512_mul_gather4
2376 .rva
.LSEH_end_rsaz_512_mul_gather4
2377 .rva
.LSEH_info_rsaz_512_mul_gather4
2379 .rva
.LSEH_begin_rsaz_512_mul_scatter4
2380 .rva
.LSEH_end_rsaz_512_mul_scatter4
2381 .rva
.LSEH_info_rsaz_512_mul_scatter4
2383 .rva
.LSEH_begin_rsaz_512_mul_by_one
2384 .rva
.LSEH_end_rsaz_512_mul_by_one
2385 .rva
.LSEH_info_rsaz_512_mul_by_one
2387 .rva
.LSEH_begin_rsaz_512_gather4
2388 .rva
.LSEH_end_rsaz_512_gather4
2389 .rva
.LSEH_info_rsaz_512_gather4
2393 .LSEH_info_rsaz_512_sqr
:
2396 .rva
.Lsqr_body
,.Lsqr_epilogue
# HandlerData[]
2397 .LSEH_info_rsaz_512_mul
:
2400 .rva
.Lmul_body
,.Lmul_epilogue
# HandlerData[]
2401 .LSEH_info_rsaz_512_mul_gather4
:
2404 .rva
.Lmul_gather4_body
,.Lmul_gather4_epilogue
# HandlerData[]
2405 .LSEH_info_rsaz_512_mul_scatter4
:
2408 .rva
.Lmul_scatter4_body
,.Lmul_scatter4_epilogue
# HandlerData[]
2409 .LSEH_info_rsaz_512_mul_by_one
:
2412 .rva
.Lmul_by_one_body
,.Lmul_by_one_epilogue
# HandlerData[]
2413 .LSEH_info_rsaz_512_gather4
:
2414 .byte
0x01,0x46,0x16,0x00
2415 .byte
0x46,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15
2416 .byte
0x3d,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14
2417 .byte
0x34,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13
2418 .byte
0x2e,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12
2419 .byte
0x28,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11
2420 .byte
0x22,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10
2421 .byte
0x1c,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9
2422 .byte
0x16,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8
2423 .byte
0x10,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7
2424 .byte
0x0b,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6
2425 .byte
0x07,0x01,0x15,0x00 # sub rsp,0xa8
# Perlasm idiom: expand every `...`-delimited expression embedded in the
# generated code by eval'ing it (e.g. the `128+24+($win64?0xb0:0)` stack
# sizes above) before the code is emitted.
$code =~ s/\`([^\`]*)\`/eval $1/gem;