3 #******************************************************************************#
4 #* Copyright(c) 2012, Intel Corp. *#
5 #* Developers and authors: *#
6 #* Shay Gueron (1, 2), and Vlad Krasnov (1) *#
7 #* (1) Intel Architecture Group, Microprocessor and Chipset Development, *#
8 #* Israel Development Center, Haifa, Israel *#
9 #* (2) University of Haifa *#
10 #******************************************************************************#
11 #* This submission to OpenSSL is to be made available under the OpenSSL *#
12 #* license, and only to the OpenSSL project, in order to allow integration *#
#* into the publicly distributed code. *#
14 #* The use of this code, or portions of this code, or concepts embedded in *#
15 #* this code, or modification of this code and/or algorithm(s) in it, or the *#
16 #* use of this code for any other purpose than stated above, requires special *#
18 #******************************************************************************#
19 #******************************************************************************#
21 #* THIS SOFTWARE IS PROVIDED BY THE CONTRIBUTORS AND THE COPYRIGHT OWNERS *#
22 #* ``AS IS''. ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED *#
23 #* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR *#
24 #* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS OR THE COPYRIGHT*#
25 #* OWNERS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, *#
26 #* OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF *#
27 #* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS *#
28 #* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN *#
29 #* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) *#
30 #* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE *#
31 #* POSSIBILITY OF SUCH DAMAGE. *#
32 #******************************************************************************#
34 #* [1] S. Gueron, "Efficient Software Implementations of Modular *#
35 #* Exponentiation", http://eprint.iacr.org/2011/239 *#
36 #* [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring". *#
37 #* IEEE Proceedings of 9th International Conference on Information *#
38 #* Technology: New Generations (ITNG 2012), 821-823 (2012). *#
39 #* [3] S. Gueron, Efficient Software Implementations of Modular Exponentiation*#
40 #* Journal of Cryptographic Engineering 2:31-43 (2012). *#
41 #* [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis *#
42 #* resistant 512-bit and 1024-bit modular exponentiation for optimizing *#
43 #* RSA1024 and RSA2048 on x86_64 platforms", *#
44 #* http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest*#
45 ################################################################################
# While the original submission covers 512- and 1024-bit exponentiation,
# this module is limited to the 512-bit version only (and as such
# accelerates RSA1024 sign). This is because the improvement for longer
# keys is not high enough to justify the effort; the highest measured
# was ~5% on Westmere. [This is relative to OpenSSL 1.0.2, which was
# upcoming at the time of this writing!] Nor does this module implement
# a "monolithic" complete exponentiation jumbo-subroutine; instead it
# adheres to a more modular mixture of C and assembly. It is also
# optimized for processors other than the Intel Core family (see the
# table below for improvement coefficients).
59 # RSA1024 sign/sec this/original |this/rsax(*) this/fips(*)
60 # ----------------+---------------------------
61 # Opteron +13% |+5% +20%
62 # Bulldozer -0% |-1% +10%
64 # Westmere +5% |+14% +17%
65 # Sandy Bridge +2% |+12% +29%
66 # Ivy Bridge +1% |+11% +35%
67 # Haswell(**) -0% |+12% +39%
69 # VIA Nano +70% |+9% +25%
71 # (*) rsax engine and fips numbers are presented for reference
# (**) you might notice the MULX code below; strangely enough, the gain
# is marginal, which is why the code remains disabled;
# A first argument containing a dot is taken to be the output file name
# rather than an assembler flavour.
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Select the Win64 variant for nasm/masm/mingw64 flavours or a .asm output.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Look for x86_64-xlate.pl in this script's directory, then under
# ../../perlasm/ relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Open a pipe to the translator script. The interpreter and script paths
# are quoted so that directories containing spaces survive the shell, and
# a failure to start the pipe is fatal instead of being silently ignored.
# $flavour and $output are left unquoted because either may be empty.
open OUT,"| \"$^X\" \"$xlate\" $flavour $output"
    or die "can't call $xlate: $!";
90 ($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp"); # common internal API
92 my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d");
98 .type rsaz_512_sqr
,\
@function,4
100 rsaz_512_sqr
: # 25-29% faster than rsaz_512_mul
110 movq
$mod, %rbp # common argument
118 movl
$times,128+8(%rsp)
165 addq
%r8, %r8 #shlq \$1, %r8
167 adcq
%r9, %r9 #shld \$1, %r8, %r9
228 lea
(%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
230 adcq
%r11, %r11 #shld \$1, %r10, %r11
268 lea
(%rbx,%r12,2), %r12 #shld \$1, %rbx, %r12
286 leaq
(%r10,%r13,2), %r13 #shld \$1, %r12, %r13
316 leaq
(%rcx,%r14,2), %r14 #shld \$1, %rcx, %r14
334 leaq
(%r12,%r15,2),%r15 #shld \$1, %r14, %r15
359 leaq
(%rbx,%r8,2), %r8 #shld \$1, %rbx, %r8
374 leaq
(%r12,%r9,2), %r9 #shld \$1, %r8, %r9
398 leaq
(%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
406 leaq
(%r15,%r11,2), %r11 #shld \$1, %r10, %r11
427 adcq
%r12, %r12 #shld \$1, %rbx, %r12
428 adcq
%r13, %r13 #shld \$1, %r12, %r13
429 adcq
%r14, %r14 #shld \$1, %r13, %r14
452 movq
$out, %xmm0 # off-load
456 mulx
16($inp), %rcx, %r10
458 mulx
24($inp), %rax, %r11
461 mulx
32($inp), %rcx, %r12
464 mulx
40($inp), %rax, %r13
467 mulx
48($inp), %rcx, %r14
470 mulx
56($inp), %rax, %r15
479 mulx
%rdx, %rax, %rdx
488 mulx
16($inp), %rax, %rbx
490 mulx
24($inp), $out, %r8
495 mulx
32($inp), %rax, %rbx
500 mulx
40($inp), $out, %r8
505 mulx
48($inp), %rax, %rbx
510 mulx
56($inp), $out, %r8
521 mulx
%rdx, %rax, %rcx
531 mulx
24($inp), $out, %r9
533 mulx
32($inp), %rax, %rcx
538 mulx
40($inp), $out, %r9
543 mulx
48($inp), %rax, %rcx
548 mulx
56($inp), $out, %r9
559 mulx
%rdx, %rax, %rdx
569 mulx
32($inp), %rax, %rbx
571 mulx
40($inp), $out, %r10
576 mulx
48($inp), %rax, %rbx
581 mulx
56($inp), $out, %r10
593 mulx
%rdx, %rax, %rdx
603 mulx
40($inp), $out, %r11
605 mulx
48($inp), %rax, %rcx
610 mulx
56($inp), $out, %r11
621 mulx
%rdx, %rax, %rdx
631 mulx
48($inp), %rax, %rbx
633 mulx
56($inp), $out, %r12
644 mulx
%rdx, %rax, %rdx
654 mulx
56($inp), %rax, %r13
663 mulx
%rdx, %rax, %rdx
673 mulx
%rdx, %rax, %rdx
694 call _rsaz_512_reduce
706 call _rsaz_512_subtract
710 movl
128+8(%rsp), $times
716 leaq
128+24+48(%rsp), %rax
726 .size rsaz_512_sqr
,.-rsaz_512_sqr
730 my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
733 .type rsaz_512_mul
,\
@function,5
745 movq
$out, %xmm0 # off-load arguments
749 movq
$bp, %rbp # pass argument
764 call _rsaz_512_reduce
776 call _rsaz_512_subtract
778 leaq
128+24+48(%rsp), %rax
788 .size rsaz_512_mul
,.-rsaz_512_mul
792 my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
794 .globl rsaz_512_mul_gather4
795 .type rsaz_512_mul_gather4
,\
@function,6
797 rsaz_512_mul_gather4
:
807 movl
64($bp,$pwr,4), %eax
808 movq
$out, %xmm0 # off-load arguments
809 movl
($bp,$pwr,4), %ebx
817 leaq
128($bp,$pwr,4), %rbp
818 mulq
%rbx # 0 iteration
972 call _rsaz_512_reduce
984 call _rsaz_512_subtract
986 leaq
128+24+48(%rsp), %rax
994 .Lmul_gather4_epilogue
:
996 .size rsaz_512_mul_gather4
,.-rsaz_512_mul_gather4
1000 my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1002 .globl rsaz_512_mul_scatter4
1003 .type rsaz_512_mul_scatter4
,\
@function,6
1005 rsaz_512_mul_scatter4
:
1014 .Lmul_scatter4_body
:
1015 leaq
($tbl,$pwr,4), $tbl
1016 movq
$out, %xmm0 # off-load arguments
1036 call _rsaz_512_reduce
1043 adcq
104(%rsp), %r13
1044 adcq
112(%rsp), %r14
1045 adcq
120(%rsp), %r15
1049 call _rsaz_512_subtract
1051 movl
%r8d, 64*0($inp) # scatter
1053 movl
%r9d, 64*2($inp)
1055 movl
%r10d, 64*4($inp)
1057 movl
%r11d, 64*6($inp)
1059 movl
%r12d, 64*8($inp)
1061 movl
%r13d, 64*10($inp)
1063 movl
%r14d, 64*12($inp)
1065 movl
%r15d, 64*14($inp)
1067 movl
%r8d, 64*1($inp)
1068 movl
%r9d, 64*3($inp)
1069 movl
%r10d, 64*5($inp)
1070 movl
%r11d, 64*7($inp)
1071 movl
%r12d, 64*9($inp)
1072 movl
%r13d, 64*11($inp)
1073 movl
%r14d, 64*13($inp)
1074 movl
%r15d, 64*15($inp)
1076 leaq
128+24+48(%rsp), %rax
1077 movq
-48(%rax), %r15
1078 movq
-40(%rax), %r14
1079 movq
-32(%rax), %r13
1080 movq
-24(%rax), %r12
1081 movq
-16(%rax), %rbp
1084 .Lmul_scatter4_epilogue
:
1086 .size rsaz_512_mul_scatter4
,.-rsaz_512_mul_scatter4
1090 my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx");
1092 .globl rsaz_512_mul_by_one
1093 .type rsaz_512_mul_by_one
,\
@function,4
1095 rsaz_512_mul_by_one
:
1105 movq
$mod, %rbp # reassign argument
1118 movdqa
%xmm0, (%rsp)
1119 movdqa
%xmm0, 16(%rsp)
1120 movdqa
%xmm0, 32(%rsp)
1121 movdqa
%xmm0, 48(%rsp)
1122 movdqa
%xmm0, 64(%rsp)
1123 movdqa
%xmm0, 80(%rsp)
1124 movdqa
%xmm0, 96(%rsp)
1126 call _rsaz_512_reduce
1137 leaq
128+24+48(%rsp), %rax
1138 movq
-48(%rax), %r15
1139 movq
-40(%rax), %r14
1140 movq
-32(%rax), %r13
1141 movq
-24(%rax), %r12
1142 movq
-16(%rax), %rbp
1145 .Lmul_by_one_epilogue
:
1147 .size rsaz_512_mul_by_one
,.-rsaz_512_mul_by_one
1150 { # _rsaz_512_reduce
1152 # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
1154 # clobbers: everything except %rbp and %rdi
1156 .type _rsaz_512_reduce
,\
@abi-omnipotent
1163 imulq
128+8(%rsp), %rbx
1166 jmp
.Lreduction_loop
1197 movq
128+8(%rsp), %rsi
1236 jne
.Lreduction_loop
1240 movq
128+8(%rsp), %rdx # pull $n0
1243 jmp
.Lreduction_loop
1248 mulx
0(%rbp), %rax, %r8
1251 mulx
8(%rbp), %rax, %r9
1256 mulx
16(%rbp), %rax, %r10
1258 mov
128+8(%rsp), %rbx # pull $n0
1263 mulx
24(%rbp), %rax, %r11
1268 mulx
32(%rbp), %rax, %r12
1273 mulx
40(%rbp), %rax, %r13
1278 mulx
48(%rbp), %rax, %r14
1283 mulx
56(%rbp), %rax, %r15
1290 jne
.Lreduction_loop
1295 .size _rsaz_512_reduce
,.-_rsaz_512_reduce
1298 { # _rsaz_512_subtract
1299 # input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask
1301 # clobbers: everything but %rdi, %rsi and %rbp
1303 .type _rsaz_512_subtract
,\
@abi-omnipotent
1359 .size _rsaz_512_subtract
,.-_rsaz_512_subtract
1364 # input: %rsi - ap, %rbp - bp
1366 # clobbers: everything
1367 my ($ap,$bp) = ("%rsi","%rbp");
1369 .type __rsaz_512_mul
,\
@abi-omnipotent
1511 .size __rsaz_512_mul
,.-__rsaz_512_mul
1515 my ($out,$inp,$power)= $win64 ?
("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
1517 .globl rsaz_512_scatter4
1518 .type rsaz_512_scatter4
,\
@abi-omnipotent
1521 leaq
($out,$power,4), $out
1531 leaq
128($out), $out
1535 .size rsaz_512_scatter4
,.-rsaz_512_scatter4
1537 .globl rsaz_512_gather4
1538 .type rsaz_512_gather4
,\
@abi-omnipotent
1541 leaq
($inp,$power,4), $inp
1548 leaq
128($inp), $inp
1556 .size rsaz_512_gather4
,.-rsaz_512_gather4
1560 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1561 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
1569 .extern __imp_RtlVirtualUnwind
1570 .type se_handler
,\
@abi-omnipotent
1584 mov
120($context),%rax # pull context->Rax
1585 mov
248($context),%rbx # pull context->Rip
1587 mov
8($disp),%rsi # disp->ImageBase
1588 mov
56($disp),%r11 # disp->HandlerData
1590 mov
0(%r11),%r10d # HandlerData[0]
1591 lea
(%rsi,%r10),%r10 # end of prologue label
1592 cmp %r10,%rbx # context->Rip<end of prologue label
1593 jb
.Lcommon_seh_tail
1595 mov
152($context),%rax # pull context->Rsp
1597 mov
4(%r11),%r10d # HandlerData[1]
1598 lea
(%rsi,%r10),%r10 # epilogue label
1599 cmp %r10,%rbx # context->Rip>=epilogue label
1600 jae
.Lcommon_seh_tail
1602 lea
128+24+48(%rax),%rax
1610 mov
%rbx,144($context) # restore context->Rbx
1611 mov
%rbp,160($context) # restore context->Rbp
1612 mov
%r12,216($context) # restore context->R12
1613 mov
%r13,224($context) # restore context->R13
1614 mov
%r14,232($context) # restore context->R14
1615 mov
%r15,240($context) # restore context->R15
1620 mov
%rax,152($context) # restore context->Rsp
1621 mov
%rsi,168($context) # restore context->Rsi
1622 mov
%rdi,176($context) # restore context->Rdi
1624 mov
40($disp),%rdi # disp->ContextRecord
1625 mov
$context,%rsi # context
1626 mov \
$154,%ecx # sizeof(CONTEXT)
1627 .long
0xa548f3fc # cld; rep movsq
1630 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1631 mov
8(%rsi),%rdx # arg2, disp->ImageBase
1632 mov
0(%rsi),%r8 # arg3, disp->ControlPc
1633 mov
16(%rsi),%r9 # arg4, disp->FunctionEntry
1634 mov
40(%rsi),%r10 # disp->ContextRecord
1635 lea
56(%rsi),%r11 # &disp->HandlerData
1636 lea
24(%rsi),%r12 # &disp->EstablisherFrame
1637 mov
%r10,32(%rsp) # arg5
1638 mov
%r11,40(%rsp) # arg6
1639 mov
%r12,48(%rsp) # arg7
1640 mov
%rcx,56(%rsp) # arg8, (NULL)
1641 call
*__imp_RtlVirtualUnwind
(%rip)
1643 mov \
$1,%eax # ExceptionContinueSearch
# The size directive must name the symbol declared by the matching .type
# directive (se_handler).
.size	se_handler,.-se_handler
1659 .rva
.LSEH_begin_rsaz_512_sqr
1660 .rva
.LSEH_end_rsaz_512_sqr
1661 .rva
.LSEH_info_rsaz_512_sqr
1663 .rva
.LSEH_begin_rsaz_512_mul
1664 .rva
.LSEH_end_rsaz_512_mul
1665 .rva
.LSEH_info_rsaz_512_mul
1667 .rva
.LSEH_begin_rsaz_512_mul_gather4
1668 .rva
.LSEH_end_rsaz_512_mul_gather4
1669 .rva
.LSEH_info_rsaz_512_mul_gather4
1671 .rva
.LSEH_begin_rsaz_512_mul_scatter4
1672 .rva
.LSEH_end_rsaz_512_mul_scatter4
1673 .rva
.LSEH_info_rsaz_512_mul_scatter4
1675 .rva
.LSEH_begin_rsaz_512_mul_by_one
1676 .rva
.LSEH_end_rsaz_512_mul_by_one
1677 .rva
.LSEH_info_rsaz_512_mul_by_one
1681 .LSEH_info_rsaz_512_sqr
:
1684 .rva
.Lsqr_body
,.Lsqr_epilogue
# HandlerData[]
1685 .LSEH_info_rsaz_512_mul
:
1688 .rva
.Lmul_body
,.Lmul_epilogue
# HandlerData[]
1689 .LSEH_info_rsaz_512_mul_gather4
:
1692 .rva
.Lmul_gather4_body
,.Lmul_gather4_epilogue
# HandlerData[]
1693 .LSEH_info_rsaz_512_mul_scatter4
:
1696 .rva
.Lmul_scatter4_body
,.Lmul_scatter4_epilogue
# HandlerData[]
1697 .LSEH_info_rsaz_512_mul_by_one
:
1700 .rva
.Lmul_by_one_body
,.Lmul_by_one_epilogue
# HandlerData[]
# Expand inline Perl: every `...` span in $code is eval'ed and replaced
# by the value of that expression.
$code =~ s{\`([^\`]*)\`}{eval $1}gem;