From: Niels Möller <nisse@lysator.liu.se>
Date: Mon, 22 Mar 2021 18:08:14 +0000 (+0100)
Subject: Merge arm64 fat support into master.
X-Git-Tag: nettle_3.8_release_20220602~131
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=a3e38b1d36d189834deaa626111faa93bee95ca9;p=thirdparty%2Fnettle.git

Merge arm64 fat support into master.
---

a3e38b1d36d189834deaa626111faa93bee95ca9
diff --cc arm64/README
index 139a3cc1,7c4e1813..d2745d57
--- a/arm64/README
+++ b/arm64/README
@@@ -1,3 -1,42 +1,42 @@@
+ General-purpose Registers[1]
+ 
+ There are thirty-one, 64-bit, general-purpose (integer) registers visible to
+ the A64 instruction set; these are labeled r0-r30. In a 64-bit context these
+ registers are normally referred to using the names x0-x30; in a 32-bit context
+ the registers are specified by using w0-w30. Additionally, a stack-pointer
+ register, SP, can be used with a restricted number of instructions.
+ 
+ The first eight registers, r0-r7, are used to pass argument values into
+ a subroutine and to return result values from a function.
+ 
+ Software developers creating platform-independent code are advised to avoid
+ using r18 if at all possible. Most compilers provide a mechanism to prevent
+ specific registers from being used for general allocation; portable hand-coded
+ assembler should avoid it entirely. It should not be assumed that treating the
+ register as callee-saved will be sufficient to satisfy the requirements of the
+ platform. Virtualization code must, of course, treat the register as it would
+ any other resource provided to the virtual machine.
+ 
+ A subroutine invocation must preserve the contents of the registers r19-r29
+ and SP. All 64 bits of each value stored in r19-r29 must be preserved, even
+ when using the ILP32 data model.
+ 
+ SIMD and Floating-Point Registers[1]
+ 
+ Unlike in AArch32, in AArch64 the 128-bit and 64-bit views of a SIMD and
+ Floating-Point register do not overlap multiple registers in a narrower view,
+ so q1, d1 and s1 all refer to the same entry in the register bank.
+ 
+ The first eight registers, v0-v7, are used to pass argument values into
 -a subroutine and to return result values from a function. They may also 
++a subroutine and to return result values from a function. They may also
+ be used to hold intermediate values within a routine (but, in general,
+ only between subroutine calls).
+ 
+ Registers v8-v15 must be preserved by a callee across subroutine calls;
+ the remaining registers (v0-v7, v16-v31) do not need to be preserved
+ (or should be preserved by the caller). Additionally, only the bottom 64 bits
+ of each value stored in v8-v15 need to be preserved.
+ 
  Endianness
  
  Similar to arm, aarch64 can run with little-endian or big-endian memory
diff --cc arm64/crypto/gcm-hash.asm
index b77b08d6,b088360b..3e4c98d8
--- a/arm64/crypto/gcm-hash.asm
+++ b/arm64/crypto/gcm-hash.asm
@@@ -284,52 -314,28 +314,28 @@@ IF_LE(
  
      eor            C0.16b,C0.16b,D.16b
  
-     PMUL C0,H1M,H1L
+     PMUL(C0,H1M,H1L)
+ 
+     REDUCTION(D)
  
-     REDUCTION D
+     subs           x4,x4,#16
+     b.ne           L1_block_loop
  
- Lmod:
+ Lpartial:
      tst            LENGTH,#15
-     b.eq           Ldone
+     b.eq           Lghash_done
  
      ld1            {H1M.2d,H1L.2d},[TABLE]
 -    
 +
-     tbz            LENGTH,3,Lmod_8
-     ldr            C0D,[DATA],#8
- IF_LE(`
-     rev64          C0.16b,C0.16b
- ')
-     mov            x7,#0
-     mov            C0.d[1],x7
- Lmod_8:
-     tst            LENGTH,#7
-     b.eq           Lmod_8_done
-     mov            x6,#0
-     mov            x5,#64
-     and            x4,LENGTH,#7
- Lmod_8_loop:
-     mov            x7,#0
-     ldrb           w7,[DATA],#1
-     sub            x5,x5,#8
-     lsl            x7,x7,x5
-     orr            x6,x6,x7
-     subs           x4,x4,#1
-     b.ne           Lmod_8_loop
-     tbz            LENGTH,3,Lmod_8_load
-     mov            C0.d[1],x6
-     b              Lmod_8_done
- Lmod_8_load:
-     mov            x7,#0
-     mov            C0.d[0],x6
-     mov            C0.d[1],x7
- Lmod_8_done:
+     LOAD_REV_PARTIAL_BLOCK(C0)
+ 
      eor            C0.16b,C0.16b,D.16b
  
-     PMUL C0,H1M,H1L
+     PMUL(C0,H1M,H1L)
  
-     REDUCTION D
+     REDUCTION(D)
  
- Ldone:
+ Lghash_done:
  IF_LE(`
      rev64          D.16b,D.16b
  ')