From: Niels Möller
Date: Thu, 9 Dec 2021 20:39:09 +0000 (+0100)
Subject: x86_64: Improved ecc_secp256r1_redc
X-Git-Tag: nettle_3.8_release_20220602~54
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=f57640ead79484297635ebbddb8e8952cd42f395;p=thirdparty%2Fnettle.git

x86_64: Improved ecc_secp256r1_redc

* x86_64/ecc-secp256r1-redc.asm: New folding scheme with one less carry propagation phase, and fewer registers, avoiding save and restore of callee-save registers. 17% speedup of this function on AMD Ryzen 5, resulting in a modest improvement in ecdsa performance.
---

diff --git a/ChangeLog b/ChangeLog
index 4c523ec1..c7c5c993 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,11 @@
 2021-12-09 Niels Möller
+ * x86_64/ecc-secp256r1-redc.asm: New folding scheme with one less
+ carry propagation phase, and fewer registers, avoiding save and
+ restore of callee-save registers. 17% speedup of this function on
+ AMD Ryzen 5, resulting in a modest improvement in ecdsa
+ performance.
+
 * powerpc64/ecc-secp256r1-redc.asm: New file, contributed by Amitay Isaacs.
diff --git a/x86_64/ecc-secp256r1-redc.asm b/x86_64/ecc-secp256r1-redc.asm
index 91f55225..bffa7726 100644
--- a/x86_64/ecc-secp256r1-redc.asm
+++ b/x86_64/ecc-secp256r1-redc.asm
@@ -1,7 +1,7 @@
 C x86_64/ecc-secp256r1-redc.asm
 ifelse(`
- Copyright (C) 2013 Niels Möller
+ Copyright (C) 2013, 2021 Niels Möller
 This file is part of GNU Nettle.
@@ -39,69 +39,61 @@ define(`U0', `%rdi') C Overlaps unused modulo input define(`U1', `%rcx') define(`U2', `%rax') define(`U3', `%r8') -define(`U4', `%r9') -define(`U5', `%r10') -define(`U6', `%r11') -define(`F0', `%r12') -define(`F1', `%r13') -define(`F2', `%rbx') -define(`F3', `%rbp') - -C FOLD(x), sets (F3,F2,F1,F0) <-- (x << 160) - (x << 128) - (x<<32) +define(`F0', `%r9') +define(`F1', `%r10') +define(`F2', `%r11') +define(`F3', `%rdx') C Overlap XP, used only in final carry folding + +C FOLD(x), sets (x,F2,F1,F0 ) <-- (x << 192) - (x << 160) + (x << 128) + (x << 32) define(`FOLD', ` + mov $1, F0 + mov $1, F1 + mov $1, F2 + shl `$'32, F0 + shr `$'32, F1 + sub F0, F2 + sbb F1, $1 +') +C FOLDC(x), sets (x,F2,F1,F0) <-- ((x+c) << 192) - (x << 160) + (x << 128) + (x << 32) +define(`FOLDC', ` + mov $1, F0 + mov $1, F1 mov $1, F2 - mov $1, F3 - shl `$'32, F2 - shr `$'32, F3 - xor F0,F0 - xor F1,F1 - sub F2, F0 - sbb F3, F1 - sbb $1, F2 - sbb `$'0, F3 + adc `$'0, $1 C May overflow, but final result will not. + shl `$'32, F0 + shr `$'32, F1 + sub F0, F2 + sbb F1, $1 ') PROLOGUE(_nettle_ecc_secp256r1_redc) W64_ENTRY(3, 0) - C save all registers that need to be saved - push %rbx - push %rbp - push %r12 - push %r13 mov (XP), U0 FOLD(U0) mov 8(XP), U1 mov 16(XP), U2 mov 24(XP), U3 - sub F0, U1 - sbb F1, U2 - sbb F2, U3 - sbb F3, U0 C Add in later - - FOLD(U1) - mov 32(XP), U4 - sub F0, U2 - sbb F1, U3 - sbb F2, U4 - sbb F3, U1 - - FOLD(U2) - mov 40(XP), U5 - sub F0, U3 - sbb F1, U4 - sbb F2, U5 - sbb F3, U2 - - FOLD(U3) - mov 48(XP), U6 - sub F0, U4 - sbb F1, U5 - sbb F2, U6 - sbb F3, U3 - - add U4, U0 - adc U5, U1 - adc U6, U2 + add F0, U1 + adc F1, U2 + adc F2, U3 + adc 32(XP), U0 + + FOLDC(U1) + add F0, U2 + adc F1, U3 + adc F2, U0 + adc 40(XP), U1 + + FOLDC(U2) + add F0, U3 + adc F1, U0 + adc F2, U1 + adc 48(XP), U2 + + FOLDC(U3) + add F0, U0 + adc F1, U1 + adc F2, U2 adc 56(XP), U3 C Sum, including carry, is < 2^{256} + p. 
@@ -126,10 +118,6 @@ PROLOGUE(_nettle_ecc_secp256r1_redc) mov U3, 24(RP) - pop %r13 - pop %r12 - pop %rbp - pop %rbx W64_EXIT(3, 0) ret EPILOGUE(_nettle_ecc_secp256r1_redc)