From: Maamoun TK Date: Fri, 14 May 2021 05:45:33 +0000 (+0300) Subject: aarch64: Optimize SHA1 Compress X-Git-Tag: nettle_3.8_release_20220602~113^2~8 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=47cafcf29951b7e9c5c1d1c4f34d29c6b2bf84c6;p=thirdparty%2Fnettle.git aarch64: Optimize SHA1 Compress This patch optimizes SHA1 compress function for arm64 architecture by taking advantage of SHA-1 instructions of Armv8 crypto extension. The SHA-1 instructions: SHA1C: SHA1 hash update (choose) SHA1H: SHA1 fixed rotate SHA1M: SHA1 hash update (majority) SHA1P: SHA1 hash update (parity) SHA1SU0: SHA1 schedule update 0 SHA1SU1: SHA1 schedule update 1 Benchmark on gcc117 instance of CFarm before applying the patch: Algorithm mode Mbyte/s sha1 update 214.16 openssl sha1 update 849.44 hmac-sha1 64 bytes 61.69 hmac-sha1 256 bytes 131.50 hmac-sha1 1024 bytes 185.20 hmac-sha1 4096 bytes 204.55 hmac-sha1 single msg 210.97 Benchmark on gcc117 instance of CFarm after applying the patch: Algorithm mode Mbyte/s sha1 update 800.80 openssl sha1 update 849.17 hmac-sha1 64 bytes 166.10 hmac-sha1 256 bytes 409.24 hmac-sha1 1024 bytes 636.98 hmac-sha1 4096 bytes 739.20 hmac-sha1 single msg 775.67 --- diff --git a/arm64/README b/arm64/README index d2745d57..206bb773 100644 --- a/arm64/README +++ b/arm64/README @@ -83,5 +83,12 @@ particular care must be taken if the loaded data is then to be regarded as elements of e.g. a doubleword vector. Indicies may appear reversed on big-endian systems (because they are). +Hardware-accelerated SHA Instructions + +The SHA optimized cores are implemented using SHA hashing instructions added +to AArch64 in crypto extensions. The repository [3] illustrates using those +instructions for optimizing SHA hashing functions. 
+
 [1] https://github.com/ARM-software/abi-aa/releases/download/2020Q4/aapcs64.pdf
 [2] https://llvm.org/docs/BigEndianNEON.html
+[3] https://github.com/noloader/SHA-Intrinsics
diff --git a/arm64/crypto/sha1-compress.asm b/arm64/crypto/sha1-compress.asm
new file mode 100644
index 00000000..de3d7b7e
--- /dev/null
+++ b/arm64/crypto/sha1-compress.asm
@@ -0,0 +1,248 @@
+C arm64/crypto/sha1-compress.asm
+
+ifelse(`
+   Copyright (C) 2021 Mamone Tarsha
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program. If
+   not, see http://www.gnu.org/licenses/.
+')
+
+C This implementation uses the SHA-1 instructions of Armv8 crypto
+C extension.
+C SHA1C: SHA1 hash update (choose)
+C SHA1H: SHA1 fixed rotate
+C SHA1M: SHA1 hash update (majority)
+C SHA1P: SHA1 hash update (parity)
+C SHA1SU0: SHA1 schedule update 0
+C SHA1SU1: SHA1 schedule update 1
+
+.file "sha1-compress.asm"
+.arch armv8-a+crypto
+
+.text
+
+C Register usage:
+
+define(`STATE', `x0')
+define(`INPUT', `x1')
+
+define(`CONST0', `v0')
+define(`CONST1', `v1')
+define(`CONST2', `v2')
+define(`CONST3', `v3')
+define(`MSG0', `v4')
+define(`MSG1', `v5')
+define(`MSG2', `v6')
+define(`MSG3', `v7')
+define(`ABCD', `v16')
+define(`ABCD_SAVED', `v17')
+define(`E0', `v18')
+define(`E0_SAVED', `v19')
+define(`E1', `v20')
+define(`TMP', `v21')
+
+C void nettle_sha1_compress(uint32_t *state, const uint8_t *input)
+C STATE points to five 32-bit words (A-D at offset 0, E at offset 16) that
+C are updated in place; INPUT points to the 64-byte block to compress.
+
+PROLOGUE(nettle_sha1_compress)
+    C Initialize constants (one round constant replicated per vector)
+    mov            w2,#0x7999
+    movk           w2,#0x5A82,lsl #16
+    dup            CONST0.4s,w2
+    mov            w2,#0xEBA1
+    movk           w2,#0x6ED9,lsl #16
+    dup            CONST1.4s,w2
+    mov            w2,#0xBCDC
+    movk           w2,#0x8F1B,lsl #16
+    dup            CONST2.4s,w2
+    mov            w2,#0xC1D6
+    movk           w2,#0xCA62,lsl #16
+    dup            CONST3.4s,w2
+
+    C Load state (x2 keeps the address of E for the final store)
+    add            x2,STATE,#16
+    movi           E0.4s,#0
+    ld1            {ABCD.4s},[STATE]
+    ld1            {E0.s}[0],[x2]
+
+    C Save state
+    mov            ABCD_SAVED.16b,ABCD.16b
+    mov            E0_SAVED.16b,E0.16b
+
+    C Load message
+    ld1            {MSG0.16b,MSG1.16b,MSG2.16b,MSG3.16b},[INPUT]
+
+    C Reverse for little endian
+    rev32          MSG0.16b,MSG0.16b
+    rev32          MSG1.16b,MSG1.16b
+    rev32          MSG2.16b,MSG2.16b
+    rev32          MSG3.16b,MSG3.16b
+
+    C Rounds 0-3
+    add            TMP.4s,MSG0.4s,CONST0.4s
+    sha1h          SFP(E1),SFP(ABCD)
+    sha1c          QFP(ABCD),SFP(E0),TMP.4s
+    sha1su0        MSG0.4s,MSG1.4s,MSG2.4s
+
+    C Rounds 4-7
+    add            TMP.4s,MSG1.4s,CONST0.4s
+    sha1h          SFP(E0),SFP(ABCD)
+    sha1c          QFP(ABCD),SFP(E1),TMP.4s
+    sha1su1        MSG0.4s,MSG3.4s
+    sha1su0        MSG1.4s,MSG2.4s,MSG3.4s
+
+    C Rounds 8-11
+    add            TMP.4s,MSG2.4s,CONST0.4s
+    sha1h          SFP(E1),SFP(ABCD)
+    sha1c          QFP(ABCD),SFP(E0),TMP.4s
+    sha1su1        MSG1.4s,MSG0.4s
+    sha1su0        MSG2.4s,MSG3.4s,MSG0.4s
+
+    C Rounds 12-15
+    add            TMP.4s,MSG3.4s,CONST0.4s
+    sha1h          SFP(E0),SFP(ABCD)
+    sha1c          QFP(ABCD),SFP(E1),TMP.4s
+    sha1su1        MSG2.4s,MSG1.4s
+    sha1su0        MSG3.4s,MSG0.4s,MSG1.4s
+
+    C Rounds 16-19
+    add            TMP.4s,MSG0.4s,CONST0.4s
+    sha1h          SFP(E1),SFP(ABCD)
+    sha1c          QFP(ABCD),SFP(E0),TMP.4s
+    sha1su1        MSG3.4s,MSG2.4s
+    sha1su0        MSG0.4s,MSG1.4s,MSG2.4s
+
+    C Rounds 20-23
+    add            TMP.4s,MSG1.4s,CONST1.4s
+    sha1h          SFP(E0),SFP(ABCD)
+    sha1p          QFP(ABCD),SFP(E1),TMP.4s
+    sha1su1        MSG0.4s,MSG3.4s
+    sha1su0        MSG1.4s,MSG2.4s,MSG3.4s
+
+    C Rounds 24-27
+    add            TMP.4s,MSG2.4s,CONST1.4s
+    sha1h          SFP(E1),SFP(ABCD)
+    sha1p          QFP(ABCD),SFP(E0),TMP.4s
+    sha1su1        MSG1.4s,MSG0.4s
+    sha1su0        MSG2.4s,MSG3.4s,MSG0.4s
+
+    C Rounds 28-31
+    add            TMP.4s,MSG3.4s,CONST1.4s
+    sha1h          SFP(E0),SFP(ABCD)
+    sha1p          QFP(ABCD),SFP(E1),TMP.4s
+    sha1su1        MSG2.4s,MSG1.4s
+    sha1su0        MSG3.4s,MSG0.4s,MSG1.4s
+
+    C Rounds 32-35
+    add            TMP.4s,MSG0.4s,CONST1.4s
+    sha1h          SFP(E1),SFP(ABCD)
+    sha1p          QFP(ABCD),SFP(E0),TMP.4s
+    sha1su1        MSG3.4s,MSG2.4s
+    sha1su0        MSG0.4s,MSG1.4s,MSG2.4s
+
+    C Rounds 36-39
+    add            TMP.4s,MSG1.4s,CONST1.4s
+    sha1h          SFP(E0),SFP(ABCD)
+    sha1p          QFP(ABCD),SFP(E1),TMP.4s
+    sha1su1        MSG0.4s,MSG3.4s
+    sha1su0        MSG1.4s,MSG2.4s,MSG3.4s
+
+    C Rounds 40-43
+    add            TMP.4s,MSG2.4s,CONST2.4s
+    sha1h          SFP(E1),SFP(ABCD)
+    sha1m          QFP(ABCD),SFP(E0),TMP.4s
+    sha1su1        MSG1.4s,MSG0.4s
+    sha1su0        MSG2.4s,MSG3.4s,MSG0.4s
+
+    C Rounds 44-47
+    add            TMP.4s,MSG3.4s,CONST2.4s
+    sha1h          SFP(E0),SFP(ABCD)
+    sha1m          QFP(ABCD),SFP(E1),TMP.4s
+    sha1su1        MSG2.4s,MSG1.4s
+    sha1su0        MSG3.4s,MSG0.4s,MSG1.4s
+
+    C Rounds 48-51
+    add            TMP.4s,MSG0.4s,CONST2.4s
+    sha1h          SFP(E1),SFP(ABCD)
+    sha1m          QFP(ABCD),SFP(E0),TMP.4s
+    sha1su1        MSG3.4s,MSG2.4s
+    sha1su0        MSG0.4s,MSG1.4s,MSG2.4s
+
+    C Rounds 52-55
+    add            TMP.4s,MSG1.4s,CONST2.4s
+    sha1h          SFP(E0),SFP(ABCD)
+    sha1m          QFP(ABCD),SFP(E1),TMP.4s
+    sha1su1        MSG0.4s,MSG3.4s
+    sha1su0        MSG1.4s,MSG2.4s,MSG3.4s
+
+    C Rounds 56-59
+    add            TMP.4s,MSG2.4s,CONST2.4s
+    sha1h          SFP(E1),SFP(ABCD)
+    sha1m          QFP(ABCD),SFP(E0),TMP.4s
+    sha1su1        MSG1.4s,MSG0.4s
+    sha1su0        MSG2.4s,MSG3.4s,MSG0.4s
+
+    C Rounds 60-63
+    add            TMP.4s,MSG3.4s,CONST3.4s
+    sha1h          SFP(E0),SFP(ABCD)
+    sha1p          QFP(ABCD),SFP(E1),TMP.4s
+    sha1su1        MSG2.4s,MSG1.4s
+    sha1su0        MSG3.4s,MSG0.4s,MSG1.4s
+
+    C Rounds 64-67
+    C The sha1su1 below completes the words for rounds 76-79, the last
+    C ones consumed, so no further sha1su0/sha1su1 pair is started.
+    add            TMP.4s,MSG0.4s,CONST3.4s
+    sha1h          SFP(E1),SFP(ABCD)
+    sha1p          QFP(ABCD),SFP(E0),TMP.4s
+    sha1su1        MSG3.4s,MSG2.4s
+
+    C Rounds 68-71
+    add            TMP.4s,MSG1.4s,CONST3.4s
+    sha1h          SFP(E0),SFP(ABCD)
+    sha1p          QFP(ABCD),SFP(E1),TMP.4s
+
+    C Rounds 72-75
+    add            TMP.4s,MSG2.4s,CONST3.4s
+    sha1h          SFP(E1),SFP(ABCD)
+    sha1p          QFP(ABCD),SFP(E0),TMP.4s
+
+    C Rounds 76-79
+    add            TMP.4s,MSG3.4s,CONST3.4s
+    sha1h          SFP(E0),SFP(ABCD)
+    sha1p          QFP(ABCD),SFP(E1),TMP.4s
+
+    C Combine state
+    add            E0.4s,E0.4s,E0_SAVED.4s
+    add            ABCD.4s,ABCD.4s,ABCD_SAVED.4s
+
+    C Store state
+    st1            {ABCD.4s},[STATE]
+    st1            {E0.s}[0],[x2]
+
+    ret
+EPILOGUE(nettle_sha1_compress)
diff --git a/arm64/machine.m4 b/arm64/machine.m4
index e69de29b..7df62bcc 100644
--- a/arm64/machine.m4
+++ b/arm64/machine.m4
@@ -0,0 +1,7 @@
+C Get 32-bit floating-point register from vector register
+C SFP(VR)
+define(`SFP',``s'substr($1,1,len($1))')
+
+C Get 128-bit floating-point register from vector register
+C QFP(VR)
+define(`QFP',``q'substr($1,1,len($1))')