From: Niels Möller
Date: Fri, 22 Aug 2014 19:35:06 +0000 (+0200)
Subject: Optimized x86_64 assembly for ecc_25519_modp.
X-Git-Tag: nettle_3.1rc1~155^2~19
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=752861f7a90c80a6a00a6e5ba87f8e7880dc4871;p=thirdparty%2Fnettle.git

Optimized x86_64 assembly for ecc_25519_modp.
---

diff --git a/ChangeLog b/ChangeLog
index b76e602e..956b0ca8 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,7 +1,9 @@
 2014-08-22  Niels Möller
 
-	* x86_64/ecc-25519-modp.asm: New file. Initial assembly
-	implementation, 30% speedup of ecc_25519_modp.
+	* x86_64/ecc-25519-modp.asm: New file. Assembly implementation,
+	initial version yields 30% speedup of ecc_25519_modp. Early
+	folding eliminates one pass of carry propagation, and yields
+	almost 20% additional speedup.
 
 	* ecc-25519.c [HAVE_NATIVE_ecc_25519_modp]: Use assembly version
 	if available.
diff --git a/x86_64/ecc-25519-modp.asm b/x86_64/ecc-25519-modp.asm
index b09262d1..b1622d5f 100644
--- a/x86_64/ecc-25519-modp.asm
+++ b/x86_64/ecc-25519-modp.asm
@@ -33,65 +33,62 @@ ifelse(<
 	.file "ecc-25519-modp.asm"
 
 define(<RP>, <%rsi>)
-define(<U0>, <%rdi>) C Overlaps unused ecc input
+define(<U0>, <%rdi>)	C Overlaps unused ecc input
 define(<U1>, <%rcx>)
 define(<U2>, <%r8>)
-define(<V1>, <%r9>)
-define(<V2>, <%r10>)
-define(<V3>, <%r11>)
-define(<M>, <%r12>)
+define(<U3>, <%r9>)
+define(<T0>, <%r10>)
+define(<T1>, <%r11>)
+define(<M>, <%rbx>)
 
 PROLOGUE(nettle_ecc_25519_modp)
 	W64_ENTRY(2, 0)
-	push	%r12
-
+	push	%rbx
+
+	C First fold the limbs affecting bit 255
+	mov	56(RP), %rax
 	mov	$38, M
-	mov	32(RP), %rax
 	mul	M
-	mov	%rax, U0
-	mov	%rdx, V1
+	mov	24(RP), U3
+	xor	T0, T0
+	add	%rax, U3
+	adc	%rdx, T0
 
-	mov	40(RP), %rax
+	mov	40(RP), %rax	C Do this early as possible
 	mul	M
-	mov	%rax, U1
-	mov	%rdx, V2
 
-	mov	48(RP), %rax
+	add	U3, U3
+	adc	T0, T0
+	shr	U3		C Undo shift, clear high bit
+
+	C Fold the high limb again, together with RP[5]
+	imul	$19, T0
+
+	mov	(RP), U0
+	mov	8(RP), U1
+	mov	16(RP), U2
+	add	T0, U0
+	adc	%rax, U1
+	mov	32(RP), %rax
+	adc	%rdx, U2
+	adc	$0, U3
+
+	C Fold final two limbs, RP[4] and RP[6]
 	mul	M
-	mov	%rax, U2
-	mov	%rdx, V3
-
-	mov	56(RP), %rax
+	mov	%rax, T0
+	mov	48(RP), %rax
+	mov	%rdx, T1
 	mul	M
-
-	add	V1, U1
-	adc	V2, U2
-	adc	V3, %rax
-	adc	$0, %rdx
-
-	shr	M
-	C FIXME: Load and add earlier?
-	add	(RP), U0
-	adc	8(RP), U1
-	adc	16(RP), U2
-	adc	24(RP), %rax
-	adc	$0, %rdx
-
-	add	%rax, %rax	C Copy high bit to carry
-	adc	%rdx, %rdx
-	shr	%rax		C Undo shift, clear high bit
-	imul	M, %rdx
-
-	add	%rdx, U0
+	add	T0, U0
 	mov	U0, (RP)
-	adc	$0, U1
+	adc	T1, U1
 	mov	U1, 8(RP)
-	adc	$0, U2
+	adc	%rax, U2
 	mov	U2, 16(RP)
-	adc	$0, %rax
-	mov	%rax, 24(RP)
+	adc	%rdx, U3
+	mov	U3, 24(RP)
 
-	pop	%r12
+	pop	%rbx
 	W64_EXIT(2, 0)
 	ret
 EPILOGUE(nettle_ecc_25519_modp)