From: Niels Möller Date: Mon, 22 Mar 2021 18:08:14 +0000 (+0100) Subject: Merge arm64 fat support into master. X-Git-Tag: nettle_3.8_release_20220602~131 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=a3e38b1d36d189834deaa626111faa93bee95ca9;p=thirdparty%2Fnettle.git Merge arm64 fat support into master. --- a3e38b1d36d189834deaa626111faa93bee95ca9 diff --cc arm64/README index 139a3cc1,7c4e1813..d2745d57 --- a/arm64/README +++ b/arm64/README @@@ -1,3 -1,42 +1,42 @@@ + General-purpose Registers[1] + + There are thirty-one, 64-bit, general-purpose (integer) registers visible to + the A64 instruction set; these are labeled r0-r30. In a 64-bit context these + registers are normally referred to using the names x0-x30; in a 32-bit context + the registers are specified by using w0-w30. Additionally, a stack-pointer + register, SP, can be used with a restricted number of instructions. + + The first eight registers, r0-r7, are used to pass argument values into + a subroutine and to return result values from a function. + + Software developers creating platform-independent code are advised to avoid + using r18 if at all possible. Most compilers provide a mechanism to prevent + specific registers from being used for general allocation; portable hand-coded + assembler should avoid it entirely. It should not be assumed that treating the + register as callee-saved will be sufficient to satisfy the requirements of the + platform. Virtualization code must, of course, treat the register as they would + any other resource provided to the virtual machine. + + A subroutine invocation must preserve the contents of the registers r19-r29 + and SP. All 64 bits of each value stored in r19-r29 must be preserved, even + when using the ILP32 data model. + + SIMD and Floating-Point Registers[1] + + Unlike in AArch32, in AArch64 the 128-bit and 64-bit views of a SIMD and + Floating-Point register do not overlap multiple registers in a narrower view, + so q1, d1 and s1 all refer to the same entry in the register bank. + + The first eight registers, v0-v7, are used to pass argument values into -a subroutine and to return result values from a function. They may also ++a subroutine and to return result values from a function. They may also + be used to hold intermediate values within a routine (but, in general, + only between subroutine calls). + + Registers v8-v15 must be preserved by a callee across subroutine calls; + the remaining registers (v0-v7, v16-v31) do not need to be preserved + (or should be preserved by the caller). Additionally, only the bottom 64 bits + of each value stored in v8-v15 need to be preserved. + Endianness Similar to arm, aarch64 can run with little-endian or big-endian memory diff --cc arm64/crypto/gcm-hash.asm index b77b08d6,b088360b..3e4c98d8 --- a/arm64/crypto/gcm-hash.asm +++ b/arm64/crypto/gcm-hash.asm @@@ -284,52 -314,28 +314,28 @@@ IF_LE( eor C0.16b,C0.16b,D.16b - PMUL C0,H1M,H1L + PMUL(C0,H1M,H1L) + + REDUCTION(D) - REDUCTION D + subs x4,x4,#16 + b.ne L1_block_loop - Lmod: + Lpartial: tst LENGTH,#15 - b.eq Ldone + b.eq Lghash_done ld1 {H1M.2d,H1L.2d},[TABLE] - + - tbz LENGTH,3,Lmod_8 - ldr C0D,[DATA],#8 - IF_LE(` - rev64 C0.16b,C0.16b - ') - mov x7,#0 - mov C0.d[1],x7 - Lmod_8: - tst LENGTH,#7 - b.eq Lmod_8_done - mov x6,#0 - mov x5,#64 - and x4,LENGTH,#7 - Lmod_8_loop: - mov x7,#0 - ldrb w7,[DATA],#1 - sub x5,x5,#8 - lsl x7,x7,x5 - orr x6,x6,x7 - subs x4,x4,#1 - b.ne Lmod_8_loop - tbz LENGTH,3,Lmod_8_load - mov C0.d[1],x6 - b Lmod_8_done - Lmod_8_load: - mov x7,#0 - mov C0.d[0],x6 - mov C0.d[1],x7 - Lmod_8_done: + LOAD_REV_PARTIAL_BLOCK(C0) + eor C0.16b,C0.16b,D.16b - PMUL C0,H1M,H1L + PMUL(C0,H1M,H1L) - REDUCTION D + REDUCTION(D) - Ldone: + Lghash_done: IF_LE(` rev64 D.16b,D.16b ')