Merge arm64 fat support into master.

author Niels Möller <nisse@lysator.liu.se>

Mon, 22 Mar 2021 18:08:14 +0000 (19:08 +0100)

committer Niels Möller <nisse@lysator.liu.se>

Mon, 22 Mar 2021 18:08:14 +0000 (19:08 +0100)
author Niels Möller <nisse@lysator.liu.se>
Mon, 22 Mar 2021 18:08:14 +0000 (19:08 +0100)
committer Niels Möller <nisse@lysator.liu.se>
Mon, 22 Mar 2021 18:08:14 +0000 (19:08 +0100)
diff --cc arm64/README

index 139a3cc166a080cd96b33289392c8cf71c0a7e24,7c4e1813ee4ff011062c6398a688d51d9d682581..d2745d575cbe1db397101a93189990c1eb3465c9
--- 1/arm64/README
--- 2/arm64/README
+++ b/arm64/README
@@@ -1,3 -1,42 +1,42 @@@
- -a subroutine and to return result values from a function. They may also 
+ General-purpose Registers[1]
+ 
+ There are thirty-one, 64-bit, general-purpose (integer) registers visible to
+ the A64 instruction set; these are labeled r0-r30. In a 64-bit context these
+ registers are normally referred to using the names x0-x30; in a 32-bit context
+ the registers are specified by using w0-w30. Additionally, a stack-pointer
+ register, SP, can be used with a restricted number of instructions.
+ 
+ The first eight registers, r0-r7, are used to pass argument values into
+ a subroutine and to return result values from a function.
+ 
+ Software developers creating platform-independent code are advised to avoid
+ using r18 if at all possible. Most compilers provide a mechanism to prevent
+ specific registers from being used for general allocation; portable hand-coded
+ assembler should avoid it entirely. It should not be assumed that treating the
+ register as callee-saved will be sufficient to satisfy the requirements of the
+ platform. Virtualization code must, of course, treat the register as it would
+ any other resource provided to the virtual machine.
+ 
+ A subroutine invocation must preserve the contents of the registers r19-r29
+ and SP. All 64 bits of each value stored in r19-r29 must be preserved, even
+ when using the ILP32 data model.
+ 
+ SIMD and Floating-Point Registers[1]
+ 
+ Unlike in AArch32, in AArch64 the 128-bit and 64-bit views of a SIMD and
+ Floating-Point register do not overlap multiple registers in a narrower view,
+ so q1, d1 and s1 all refer to the same entry in the register bank.
+ 
+ The first eight registers, v0-v7, are used to pass argument values into
++a subroutine and to return result values from a function. They may also
+ be used to hold intermediate values within a routine (but, in general,
+ only between subroutine calls).
+ 
+ Registers v8-v15 must be preserved by a callee across subroutine calls;
+ the remaining registers (v0-v7, v16-v31) do not need to be preserved
+ (or should be preserved by the caller). Additionally, only the bottom 64 bits
+ of each value stored in v8-v15 need to be preserved.
+ 
   Endianness
   
   Similar to arm, aarch64 can run with little-endian or big-endian memory
diff --cc arm64/crypto/gcm-hash.asm

index b77b08d6f3cacbf4b1a42592cdad3779d8797c51,b088360b6cb86a180b907ca997e456043f535eb4..3e4c98d85ee36fcdff88c9bdb3be27c06841a5b6
--- 1/arm64/crypto/gcm-hash.asm
--- 2/arm64/crypto/gcm-hash.asm
+++ b/arm64/crypto/gcm-hash.asm
@@@ -284,52 -314,28 +314,28 @@@ IF_LE(
   
       eor            C0.16b,C0.16b,D.16b
   
-     PMUL C0,H1M,H1L
+     PMUL(C0,H1M,H1L)
+ 
+     REDUCTION(D)
   
-     REDUCTION D
+     subs           x4,x4,#16
+     b.ne           L1_block_loop
   
- Lmod:
+ Lpartial:
       tst            LENGTH,#15
-     b.eq           Ldone
+     b.eq           Lghash_done
   
       ld1            {H1M.2d,H1L.2d},[TABLE]
- -    
+ +
-     tbz            LENGTH,3,Lmod_8
-     ldr            C0D,[DATA],#8
- IF_LE(`
-     rev64          C0.16b,C0.16b
- ')
-     mov            x7,#0
-     mov            C0.d[1],x7
- Lmod_8:
-     tst            LENGTH,#7
-     b.eq           Lmod_8_done
-     mov            x6,#0
-     mov            x5,#64
-     and            x4,LENGTH,#7
- Lmod_8_loop:
-     mov            x7,#0
-     ldrb           w7,[DATA],#1
-     sub            x5,x5,#8
-     lsl            x7,x7,x5
-     orr            x6,x6,x7
-     subs           x4,x4,#1
-     b.ne           Lmod_8_loop
-     tbz            LENGTH,3,Lmod_8_load
-     mov            C0.d[1],x6
-     b              Lmod_8_done
- Lmod_8_load:
-     mov            x7,#0
-     mov            C0.d[0],x6
-     mov            C0.d[1],x7
- Lmod_8_done:
+     LOAD_REV_PARTIAL_BLOCK(C0)
+ 
       eor            C0.16b,C0.16b,D.16b
   
-     PMUL C0,H1M,H1L
+     PMUL(C0,H1M,H1L)
   
-     REDUCTION D
+     REDUCTION(D)
   
- Ldone:
+ Lghash_done:
   IF_LE(`
       rev64          D.16b,D.16b
   ')
author	Niels Möller <nisse@lysator.liu.se>
	Mon, 22 Mar 2021 18:08:14 +0000 (19:08 +0100)
committer	Niels Möller <nisse@lysator.liu.se>
	Mon, 22 Mar 2021 18:08:14 +0000 (19:08 +0100)
		1	2
arm64/README	patch |	diff1 |	diff2 |	blob | history
arm64/crypto/gcm-hash.asm	patch |	diff1 |	diff2 |	blob | history