From 54f3d200d32e70aa1c5b4d72e6c879e3da769926 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Sun, 9 Oct 2005 09:53:58 +0000 Subject: [PATCH] Throw in bn/asm/x86-mont.pl Montgomery multiplication "teaser". --- Configure | 8 +- TABLE | 52 ++++----- crypto/bn/Makefile | 6 + crypto/bn/asm/x86-mont.pl | 238 ++++++++++++++++++++++++++++++++++++++ crypto/perlasm/x86unix.pl | 8 +- 5 files changed, 281 insertions(+), 31 deletions(-) create mode 100755 crypto/bn/asm/x86-mont.pl diff --git a/Configure b/Configure index 2c40de8ff6..9a534712fd 100755 --- a/Configure +++ b/Configure @@ -114,9 +114,9 @@ my $tlib="-lnsl -lsocket"; my $bits1="THIRTY_TWO_BIT "; my $bits2="SIXTY_FOUR_BIT "; -my $x86_elf_asm="x86cpuid-elf.o:bn86-elf.o co86-elf.o:dx86-elf.o yx86-elf.o:ax86-elf.o:bx86-elf.o:mx86-elf.o:sx86-elf.o s512sse2-elf.o:cx86-elf.o:rx86-elf.o:rm86-elf.o:r586-elf.o"; -my $x86_coff_asm="x86cpuid-cof.o:bn86-cof.o co86-cof.o:dx86-cof.o yx86-cof.o:ax86-cof.o:bx86-cof.o:mx86-cof.o:sx86-cof.o s512sse2-cof.o:cx86-cof.o:rx86-cof.o:rm86-cof.o:r586-cof.o"; -my $x86_out_asm="x86cpuid-out.o:bn86-out.o co86-out.o:dx86-out.o yx86-out.o:ax86-out.o:bx86-out.o:mx86-out.o:sx86-out.o s512sse2-out.o:cx86-out.o:rx86-out.o:rm86-out.o:r586-out.o"; +my $x86_elf_asm="x86cpuid-elf.o:bn86-elf.o co86-elf.o mo86-elf.o:dx86-elf.o yx86-elf.o:ax86-elf.o:bx86-elf.o:mx86-elf.o:sx86-elf.o s512sse2-elf.o:cx86-elf.o:rx86-elf.o:rm86-elf.o:r586-elf.o"; +my $x86_coff_asm="x86cpuid-cof.o:bn86-cof.o co86-cof.o mo86-cof.o:dx86-cof.o yx86-cof.o:ax86-cof.o:bx86-cof.o:mx86-cof.o:sx86-cof.o s512sse2-cof.o:cx86-cof.o:rx86-cof.o:rm86-cof.o:r586-cof.o"; +my $x86_out_asm="x86cpuid-out.o:bn86-out.o co86-out.o mo86-out.o:dx86-out.o yx86-out.o:ax86-out.o:bx86-out.o:mx86-out.o:sx86-out.o s512sse2-out.o:cx86-out.o:rx86-out.o:rm86-out.o:r586-out.o"; my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o::aes-x86_64.o::md5-x86_64.o:sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o::"; my $ia64_asm="ia64cpuid.o:bn-ia64.o::aes_core.o 
aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o::"; @@ -1171,7 +1171,7 @@ $bn_obj = $bn_asm unless $bn_obj ne ""; $cflags.=" -DOPENSSL_BN_ASM_PART_WORDS" if ($bn_obj =~ /bn86/); $cflags.=" -DOPENSSL_IA32_SSE2" if (!$no_sse2 && $bn_obj =~ /bn86/); -$cflags.=" -DOPENSSL_BN_ASM_MONT" if ($bn_obj =~ /\-mont/); +$cflags.=" -DOPENSSL_BN_ASM_MONT" if ($bn_obj =~ /\-mont|mo86\-/); $des_obj=$des_enc unless ($des_obj =~ /\.o$/); $bf_obj=$bf_enc unless ($bf_obj =~ /\.o$/); diff --git a/TABLE b/TABLE index 58a8a73aa5..5e9f14e3b7 100644 --- a/TABLE +++ b/TABLE @@ -198,7 +198,7 @@ $sys_id = $lflags = $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid-out.o -$bn_obj = bn86-out.o co86-out.o +$bn_obj = bn86-out.o co86-out.o mo86-out.o $des_obj = dx86-out.o yx86-out.o $aes_obj = ax86-out.o $bf_obj = bx86-out.o @@ -225,7 +225,7 @@ $sys_id = $lflags = $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid-elf.o -$bn_obj = bn86-elf.o co86-elf.o +$bn_obj = bn86-elf.o co86-elf.o mo86-elf.o $des_obj = dx86-elf.o yx86-elf.o $aes_obj = ax86-elf.o $bf_obj = bx86-elf.o @@ -279,7 +279,7 @@ $sys_id = CYGWIN32 $lflags = $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid-cof.o -$bn_obj = bn86-cof.o co86-cof.o +$bn_obj = bn86-cof.o co86-cof.o mo86-cof.o $des_obj = dx86-cof.o yx86-cof.o $aes_obj = ax86-cof.o $bf_obj = bx86-cof.o @@ -333,7 +333,7 @@ $sys_id = MSDOS $lflags = -L/dev/env/WATT_ROOT/lib -lwatt $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid-out.o -$bn_obj = bn86-out.o co86-out.o +$bn_obj = bn86-out.o co86-out.o mo86-out.o $des_obj = dx86-out.o yx86-out.o $aes_obj = ax86-out.o $bf_obj = bx86-out.o @@ -819,7 +819,7 @@ $sys_id = $lflags = -ldl $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid-elf.o -$bn_obj = bn86-elf.o co86-elf.o +$bn_obj = bn86-elf.o co86-elf.o 
mo86-elf.o $des_obj = dx86-elf.o yx86-elf.o $aes_obj = ax86-elf.o $bf_obj = bx86-elf.o @@ -1008,7 +1008,7 @@ $sys_id = $lflags = $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid-elf.o -$bn_obj = bn86-elf.o co86-elf.o +$bn_obj = bn86-elf.o co86-elf.o mo86-elf.o $des_obj = dx86-elf.o yx86-elf.o $aes_obj = ax86-elf.o $bf_obj = bx86-elf.o @@ -1197,7 +1197,7 @@ $sys_id = $lflags = $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid-elf.o -$bn_obj = bn86-elf.o co86-elf.o +$bn_obj = bn86-elf.o co86-elf.o mo86-elf.o $des_obj = dx86-elf.o yx86-elf.o $aes_obj = ax86-elf.o $bf_obj = bx86-elf.o @@ -1278,7 +1278,7 @@ $sys_id = $lflags = -ldl $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid-elf.o -$bn_obj = bn86-elf.o co86-elf.o +$bn_obj = bn86-elf.o co86-elf.o mo86-elf.o $des_obj = dx86-elf.o yx86-elf.o $aes_obj = ax86-elf.o $bf_obj = bx86-elf.o @@ -1305,7 +1305,7 @@ $sys_id = $lflags = -ldl $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid-elf.o -$bn_obj = bn86-elf.o co86-elf.o +$bn_obj = bn86-elf.o co86-elf.o mo86-elf.o $des_obj = dx86-elf.o yx86-elf.o $aes_obj = ax86-elf.o $bf_obj = bx86-elf.o @@ -1386,7 +1386,7 @@ $sys_id = $lflags = -lefence -ldl $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid-elf.o -$bn_obj = bn86-elf.o co86-elf.o +$bn_obj = bn86-elf.o co86-elf.o mo86-elf.o $des_obj = dx86-elf.o yx86-elf.o $aes_obj = ax86-elf.o $bf_obj = bx86-elf.o @@ -1413,7 +1413,7 @@ $sys_id = $lflags = -ldl $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid-elf.o -$bn_obj = bn86-elf.o co86-elf.o +$bn_obj = bn86-elf.o co86-elf.o mo86-elf.o $des_obj = dx86-elf.o yx86-elf.o $aes_obj = ax86-elf.o $bf_obj = bx86-elf.o @@ -1440,7 +1440,7 @@ $sys_id = $lflags = -ldl $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid-elf.o -$bn_obj 
= bn86-elf.o co86-elf.o +$bn_obj = bn86-elf.o co86-elf.o mo86-elf.o $des_obj = dx86-elf.o yx86-elf.o $aes_obj = ax86-elf.o $bf_obj = bx86-elf.o @@ -1467,7 +1467,7 @@ $sys_id = $lflags = -ldl $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid-elf.o -$bn_obj = bn86-elf.o co86-elf.o +$bn_obj = bn86-elf.o co86-elf.o mo86-elf.o $des_obj = dx86-elf.o yx86-elf.o $aes_obj = ax86-elf.o $bf_obj = bx86-elf.o @@ -1494,7 +1494,7 @@ $sys_id = $lflags = $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid-elf.o -$bn_obj = bn86-elf.o co86-elf.o +$bn_obj = bn86-elf.o co86-elf.o mo86-elf.o $des_obj = dx86-elf.o yx86-elf.o $aes_obj = ax86-elf.o $bf_obj = bx86-elf.o @@ -1629,7 +1629,7 @@ $sys_id = $lflags = -rdynamic -ldl $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid-elf.o -$bn_obj = bn86-elf.o co86-elf.o +$bn_obj = bn86-elf.o co86-elf.o mo86-elf.o $des_obj = dx86-elf.o yx86-elf.o $aes_obj = ax86-elf.o $bf_obj = bx86-elf.o @@ -1791,7 +1791,7 @@ $sys_id = $lflags = -lnsl -lsocket $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid-elf.o -$bn_obj = bn86-elf.o co86-elf.o +$bn_obj = bn86-elf.o co86-elf.o mo86-elf.o $des_obj = dx86-elf.o yx86-elf.o $aes_obj = ax86-elf.o $bf_obj = bx86-elf.o @@ -2250,7 +2250,7 @@ $sys_id = $lflags = -ldl $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid-elf.o -$bn_obj = bn86-elf.o co86-elf.o +$bn_obj = bn86-elf.o co86-elf.o mo86-elf.o $des_obj = dx86-elf.o yx86-elf.o $aes_obj = ax86-elf.o $bf_obj = bx86-elf.o @@ -2547,7 +2547,7 @@ $sys_id = $lflags = $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid-out.o -$bn_obj = bn86-out.o co86-out.o +$bn_obj = bn86-out.o co86-out.o mo86-out.o $des_obj = dx86-out.o yx86-out.o $aes_obj = ax86-out.o $bf_obj = bx86-out.o @@ -2574,7 +2574,7 @@ $sys_id = $lflags = -ldl $bn_ops = BN_LLONG DES_PTR 
DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid-elf.o -$bn_obj = bn86-elf.o co86-elf.o +$bn_obj = bn86-elf.o co86-elf.o mo86-elf.o $des_obj = dx86-elf.o yx86-elf.o $aes_obj = ax86-elf.o $bf_obj = bx86-elf.o @@ -2655,7 +2655,7 @@ $sys_id = $lflags = -ldl $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid-elf.o -$bn_obj = bn86-elf.o co86-elf.o +$bn_obj = bn86-elf.o co86-elf.o mo86-elf.o $des_obj = dx86-elf.o yx86-elf.o $aes_obj = ax86-elf.o $bf_obj = bx86-elf.o @@ -2925,7 +2925,7 @@ $sys_id = MINGW32 $lflags = -lwsock32 -lgdi32 $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT EXPORT_VAR_AS_FN $cpuid_obj = x86cpuid-cof.o -$bn_obj = bn86-cof.o co86-cof.o +$bn_obj = bn86-cof.o co86-cof.o mo86-cof.o $des_obj = dx86-cof.o yx86-cof.o $aes_obj = ax86-cof.o $bf_obj = bx86-cof.o @@ -3330,7 +3330,7 @@ $sys_id = $lflags = -lsocket -lnsl $bn_ops = DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid-elf.o -$bn_obj = bn86-elf.o co86-elf.o +$bn_obj = bn86-elf.o co86-elf.o mo86-elf.o $des_obj = dx86-elf.o yx86-elf.o $aes_obj = ax86-elf.o $bf_obj = bx86-elf.o @@ -3357,7 +3357,7 @@ $sys_id = $lflags = -lsocket -lnsl $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid-elf.o -$bn_obj = bn86-elf.o co86-elf.o +$bn_obj = bn86-elf.o co86-elf.o mo86-elf.o $des_obj = dx86-elf.o yx86-elf.o $aes_obj = ax86-elf.o $bf_obj = bx86-elf.o @@ -3573,7 +3573,7 @@ $sys_id = $lflags = -lsocket -lnsl -ldl $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid-elf.o -$bn_obj = bn86-elf.o co86-elf.o +$bn_obj = bn86-elf.o co86-elf.o mo86-elf.o $des_obj = dx86-elf.o yx86-elf.o $aes_obj = ax86-elf.o $bf_obj = bx86-elf.o @@ -3897,7 +3897,7 @@ $sys_id = $lflags = -lsocket -lnsl $bn_ops = BN_LLONG MD2_CHAR RC4_INDEX DES_PTR DES_RISC1 DES_UNROLL $cpuid_obj = x86cpuid-elf.o -$bn_obj = bn86-elf.o co86-elf.o +$bn_obj = bn86-elf.o co86-elf.o mo86-elf.o $des_obj = 
dx86-elf.o yx86-elf.o $aes_obj = ax86-elf.o $bf_obj = bx86-elf.o @@ -3924,7 +3924,7 @@ $sys_id = $lflags = -lsocket -lnsl $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid-elf.o -$bn_obj = bn86-elf.o co86-elf.o +$bn_obj = bn86-elf.o co86-elf.o mo86-elf.o $des_obj = dx86-elf.o yx86-elf.o $aes_obj = ax86-elf.o $bf_obj = bx86-elf.o diff --git a/crypto/bn/Makefile b/crypto/bn/Makefile index a818ea22eb..06ebed08eb 100644 --- a/crypto/bn/Makefile +++ b/crypto/bn/Makefile @@ -67,16 +67,22 @@ bn86-elf.s: asm/bn-586.pl ../perlasm/x86asm.pl (cd asm; $(PERL) bn-586.pl elf $(CFLAGS) > ../$@) co86-elf.s: asm/co-586.pl ../perlasm/x86asm.pl (cd asm; $(PERL) co-586.pl elf $(CFLAGS) > ../$@) +mo86-elf.s: asm/x86-mont.pl ../perlasm/x86asm.pl + (cd asm; $(PERL) x86-mont.pl elf $(CFLAGS) > ../$@) # COFF bn86-cof.s: asm/bn-586.pl ../perlasm/x86asm.pl (cd asm; $(PERL) bn-586.pl coff $(CFLAGS) > ../$@) co86-cof.s: asm/co-586.pl ../perlasm/x86asm.pl (cd asm; $(PERL) co-586.pl coff $(CFLAGS) > ../$@) +mo86-cof.s: asm/x86-mont.pl ../perlasm/x86asm.pl + (cd asm; $(PERL) x86-mont.pl coff $(CFLAGS) > ../$@) # a.out bn86-out.s: asm/bn-586.pl ../perlasm/x86asm.pl (cd asm; $(PERL) bn-586.pl a.out $(CFLAGS) > ../$@) co86-out.s: asm/co-586.pl ../perlasm/x86asm.pl (cd asm; $(PERL) co-586.pl a.out $(CFLAGS) > ../$@) +mo86-out.s: asm/x86-mont.pl ../perlasm/x86asm.pl + (cd asm; $(PERL) x86-mont.pl a.out $(CFLAGS) > ../$@) sparcv8.o: asm/sparcv8.S $(CC) $(CFLAGS) -c asm/sparcv8.S diff --git a/crypto/bn/asm/x86-mont.pl b/crypto/bn/asm/x86-mont.pl new file mode 100755 index 0000000000..dbf5cb173d --- /dev/null +++ b/crypto/bn/asm/x86-mont.pl @@ -0,0 +1,238 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. Rights for redistribution and usage in source and binary +# forms are granted according to the OpenSSL license. 
+# ==================================================================== + +# This is "teaser" code, as it can be improved in several ways... +# First of all, a non-SSE2 path should be implemented (yes, for now it +# performs Montgomery multiplication/convolution only on SSE2-capable +# CPUs such as P4; others fall back to the original code). Then the inner +# loop can be unrolled and modulo-scheduled to improve ILP and possibly +# moved to the 128-bit XMM register bank (though it would require input +# rearrangement and/or increase bus bandwidth utilization). A dedicated +# squaring procedure should give further performance improvement... +# Yet, even as a draft, the code improves the rsa512 *sign* benchmark by +# 110%(!), the rsa1024 one by 70% and the rsa4096 one by 20%:-) + +push(@INC,"perlasm","../../perlasm"); +require "x86asm.pl"; + +&asm_init($ARGV[0],$0); + +$sse2=0; +for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } + +&external_label("OPENSSL_ia32cap_P") if ($sse2); + +&function_begin("bn_mul_mont",$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":""); + +$i="ebx"; +$j="ecx"; +$ap="esi"; +$rp="edi"; $bp="edi"; # rp and bp share edi (overlapping variables!!!) +$np="edx"; +$num="ebp"; +$tp="esp"; + +$bias=2; # amount of extra words in tp + # (rounded up to even value) +$_rp=&DWP(4*($bias+0),"esp",$num,4); # frame of saved arguments, starts at tp[num+$bias] +$_ap=&DWP(4*($bias+1),"esp",$num,4); +$_bp=&DWP(4*($bias+2),"esp",$num,4); +$_np=&DWP(4*($bias+3),"esp",$num,4); +$_n0=&DWP(4*($bias+4),"esp",$num,4); +$_sp=&DWP(4*($bias+5),"esp",$num,4); + +$acc0="mm0"; # mmx register bank layout +$acc1="mm1"; +$car0="mm2"; +$car1="mm3"; +$mul0="mm4"; +$mul1="mm5"; +$temp="mm6"; +$mask="mm7"; + +if($sse2) { + &picmeup("eax","OPENSSL_ia32cap_P"); + &bt (&DWP(0,"eax"),26); # bit 26 gates the SSE2 path + &mov ("eax",0); # zero signals "we did nothing" + &jnc (&label("non_sse2")); + + ################################# load argument block...
+ &mov ("eax",&wparam(0)); # BN_ULONG *rp + &mov ("ebx",&wparam(1)); # const BN_ULONG *ap + &mov ("ecx",&wparam(2)); # const BN_ULONG *bp + &mov ("edx",&wparam(3)); # const BN_ULONG *np + &mov ("esi",&wparam(4)); # BN_ULONG n0 + &mov ($num,&wparam(5)); # int num + + &mov ("edi","esp"); # remember incoming stack pointer! + &add ($num,$bias+6); + &neg ($num); + &lea ("esp",&DWP(0,"esp",$num,4)); # alloca(4*(num+$bias+6)) + &neg ($num); + &and ("esp",-1024); # round esp down to 1KB, minimize TLB utilization + &sub ($num,$bias+6); # num is restored to its original value + # and will remain constant from now on... + + &mov ($_rp,"eax"); # ... save a copy of argument block + &mov ($_ap,"ebx"); + &mov ($_bp,"ecx"); + &mov ($_np,"edx"); + &mov ($_n0,"esi"); + &mov ($_sp,"edi"); # saved stack pointer! + + &mov ("eax",-1); + &movd ($mask,"eax"); # mask selecting the lower 32 bits + + &mov ($ap,$_ap); # load input pointers + &mov ($bp,$_bp); + &mov ($np,$_np); + + &xor ($i,$i); # i=0 + &xor ($j,$j); # j=0 + + &movd ($mul0,&DWP(0,$bp)); # bp[0] + &movd ($mul1,&DWP(0,$ap)); # ap[0] + &movd ($car1,&DWP(0,$np)); # np[0] + + &pmuludq($mul1,$mul0); # ap[0]*bp[0] + &movq ($car0,$mul1); + &movq ($acc0,$mul1); # I wish movd worked for + &pand ($acc0,$mask); # inter-register transfers + + &pmuludq($mul1,$_n0); # *=n0, this is the "m1" used below + + &pmuludq($car1,$mul1); # "t[0]"*np[0]*n0 + &paddq ($car1,$acc0); + + &psrlq ($car0,32); + &psrlq ($car1,32); + + &inc ($j); # j++ +&set_label("1st"); + &movd ($acc0,&DWP(0,$ap,$j,4)); # ap[j] + &movd ($acc1,&DWP(0,$np,$j,4)); # np[j] + &pmuludq($acc0,$mul0); # ap[j]*bp[0] + &pmuludq($acc1,$mul1); # np[j]*m1 + + &paddq ($car0,$acc0); # c0+=ap[j]*bp[0] + &movq ($acc0,$car0); + &pand ($acc0,$mask); + + &paddq ($car1,$acc1); # c1+=np[j]*m1 + &paddq ($car1,$acc0); # c1+=lower 32 bits of c0 + &movd (&DWP(-4,"esp",$j,4),$car1); # tp[j-1]= + + &psrlq ($car0,32); + &psrlq ($car1,32); + + &lea ($j,&DWP(1,$j)); # j++ + &cmp ($j,$num); + &jl (&label("1st")); + + &paddq ($car1,$car0); + &movq (&DWP(-4,"esp",$num,4),$car1); # 64-bit store at tp[num-1] + + &inc ($i); # i++ +&set_label("outer");
+ &xor ($j,$j); # j=0 + + &movd ($mul0,&DWP(0,$bp,$i,4)); # bp[i] + &movd ($mul1,&DWP(0,$ap)); # ap[0] + &movd ($temp,&DWP(0,"esp")); # tp[0] + &movd ($car1,&DWP(0,$np,$j,4)); # np[0] + &pmuludq($mul1,$mul0); # ap[0]*bp[i] + + &paddq ($mul1,$temp); # +=tp[0] + &movq ($acc0,$mul1); + &movq ($car0,$mul1); + &pand ($acc0,$mask); + + &pmuludq($mul1,$_n0); # *=n0, this is the "m1" used below + + &pmuludq($car1,$mul1); # np[0]*m1 + &paddq ($car1,$acc0); + + &psrlq ($car0,32); + &psrlq ($car1,32); + + &inc ($j); # j++ +&set_label("inner"); + &movd ($acc0,&DWP(0,$ap,$j,4)); # ap[j] + &movd ($acc1,&DWP(0,$np,$j,4)); # np[j] + &movd ($temp,&DWP(0,"esp",$j,4)); # tp[j] + &pmuludq($acc0,$mul0); # ap[j]*bp[i] + &pmuludq($acc1,$mul1); # np[j]*m1 + &paddq ($car0,$temp); # c0+=tp[j] + &paddq ($car0,$acc0); # c0+=ap[j]*bp[i] + &movq ($acc0,$car0); + &pand ($acc0,$mask); + + &paddq ($car1,$acc1); # c1+=np[j]*m1 + &paddq ($car1,$acc0); # c1+=lower 32 bits of c0 + &movd (&DWP(-4,"esp",$j,4),$car1); # tp[j-1]= + + &psrlq ($car0,32); + &psrlq ($car1,32); + + &lea ($j,&DWP(1,$j)); # j++ + &cmp ($j,$num); + &jl (&label("inner")); + + &movd ($temp,&DWP(0,"esp",$num,4)); # tp[num] + &paddq ($car1,$car0); + &paddq ($car1,$temp); + &movq (&DWP(-4,"esp",$num,4),$car1); # 64-bit store at tp[num-1] + + &lea ($i,&DWP(1,$i)); # i++ + &cmp ($i,$num); + &jl (&label("outer")); + + &emms (); # done with mmx bank + + &mov ("esi",&DWP(0,"esp",$num,4)); # load topmost overflow bit + &mov ($rp,$_rp); # load result pointer + # [$ap and $bp are zapped] + &xor ($i,$i); # i=0 + &lea ($j,&DWP(-1,$num)); # j=num-1 + &cmp ("esi",0); # clears CF unconditionally + &jnz (&label("sub")); + &mov ("eax",&DWP(0,"esp",$j,4)); # tp[num-1] + &cmp ("eax",&DWP(0,$np,$j,4)); # tp[num-1]-np[num-1]?
+ &jae (&label("sub")); # if taken CF is cleared +&set_label("copy"); + &mov ("eax",&DWP(0,"esp",$j,4)); + &mov (&DWP(0,$rp,$j,4),"eax"); # rp[j]=tp[j] + &mov (&DWP(0,"esp",$j,4),$j); # zap temporary vector (stores j, not zero) + &dec ($j); + &jge (&label("copy")); + &jmp (&label("exit_sse2")); + +&set_label("sub",4); + &mov ("eax",&DWP(0,"esp",$i,4)); + &sbb ("eax",&DWP(0,$np,$i,4)); + &mov (&DWP(0,$rp,$i,4),"eax"); # rp[i]=tp[i]-np[i] + &lea ($i,&DWP(1,$i)); # i++ + &dec ($j); # doesn't affect CF! + &jge (&label("sub")); + &lea ($j,&DWP(-1,$num)); # j=num-1 + &sbb ("esi",0); # esi holds topmost overflow bit + &jc (&label("copy")); # borrow: tp<np, so rp=tp after all +&set_label("zap"); + &mov (&DWP(0,"esp",$j,4),$i); # zap temporary vector (stores $i, not zero) + &dec ($j); + &jge (&label("zap")); + +&set_label("exit_sse2"); + &mov ("esp",$_sp); # pull saved stack pointer + &mov ("eax",1); +&set_label("non_sse2"); +} + +&function_end("bn_mul_mont"); + +&asm_finish(); diff --git a/crypto/perlasm/x86unix.pl b/crypto/perlasm/x86unix.pl index 9bc5c0e893..38c7a6df42 100644 --- a/crypto/perlasm/x86unix.pl +++ b/crypto/perlasm/x86unix.pl @@ -215,7 +215,6 @@ sub main'cld { &out0("cld"); } # SSE2 sub main'emms { &out0("emms"); } sub main'movd { &out2("movd",@_); } -sub main'movq { &out2("movq",@_); } sub main'movdqu { &out2("movdqu",@_); } sub main'movdqa { &out2("movdqa",@_); } sub main'movdq2q{ &out2("movdq2q",@_); } @@ -227,6 +226,13 @@ sub main'psllq { &out2("psllq",@_); } sub main'pxor { &out2("pxor",@_); } sub main'por { &out2("por",@_); } sub main'pand { &out2("pand",@_); } +sub main'movq { + local($p1,$p2)=@_; + if ($p1=~/^mm[0-7]$/ && $p2=~/^mm[0-7]$/) + # movq between mmx registers sinks Intel CPUs; pshufw with 0xe4 keeps word order, i.e. is a plain copy + { push(@out,"\tpshufw\t\$0xe4,%$p2,%$p1\n"); } + else { &out2("movq",@_); } + } # The bswapl instruction is new for the 486. Emulate if i386. sub main'bswap -- 2.39.5