[S390x] Improvements on documentation and instruction set usage for SHA3 permute

author Mamone Tarsha <maamoun.tk@googlemail.com>

Sun, 24 Oct 2021 18:39:11 +0000 (20:39 +0200)

committer Mamone Tarsha <maamoun.tk@googlemail.com>

Sun, 24 Oct 2021 18:39:11 +0000 (20:39 +0200)
author Mamone Tarsha <maamoun.tk@googlemail.com>
Sun, 24 Oct 2021 18:39:11 +0000 (20:39 +0200)
committer Mamone Tarsha <maamoun.tk@googlemail.com>
Sun, 24 Oct 2021 18:39:11 +0000 (20:39 +0200)
diff --git a/s390x/vf/sha3-permute.asm b/s390x/vf/sha3-permute.asm

index 517ce8948d75b98a661fd47253cb758c1a48e363..d656b97cb89395fba731c4e5df1d3bbeb1904d07 100644 (file)
--- a/s390x/vf/sha3-permute.asm
+++ b/s390x/vf/sha3-permute.asm
@@ -85,15 +85,17 @@ C void
  C sha3_permute(struct sha3_ctx *ctx)
  
  PROLOGUE(nettle_sha3_permute)
-    stmg           %r6,%r14,48(SP)
-    ALLOC_STACK(%r1,16)
+    stmg           %r6,%r14,48(SP)       C Save non-volatile general registers
+    ALLOC_STACK(%r1,16)                  C Allocate 16-byte space on stack
+    C Save non-volatile floating point registers
      std            %f8,0(%r1)
      std            %f9,8(%r1)
    
      lghi           COUNT,24*8
-    larl           RC,.rc
+    larl           RC,.rc                C Load address of rc data
      aghi           RC,-8
    
+    C Load state data
      lg             A00,0*8(STATE)
      vl             A0102,1*8(STATE)
      vl             A0304,3*8(STATE)
@@ -130,25 +132,31 @@ PROLOGUE(nettle_sha3_permute)
  
  .align  16
  .Loop:
+    C The theta step. Combine parity bits, then xor to state.
+    C D0 = C4 ^ (C1 <<< 1)
+    C D1 = C0 ^ (C2 <<< 1)
+    C D2 = C1 ^ (C3 <<< 1)
+    C D3 = C2 ^ (C4 <<< 1)
+    C D4 = C3 ^ (C0 <<< 1)
+
+    C Shift the words around, putting (C0, C1) in D12, (C2, C3) in
+    C   D34, and (C4, C0) in C34.
+
      vlvgg          D12,C0,0
      vmrhg          D12,D12,C12           C Holds C0, C1
      vpdi           D34,C12,C34,0b0100    C Holds C2, C3
      vpdi           C34,C34,D12,0b0100    C Holds C4, C0
-    vlgvg          D0,C34,0
+
      vlgvg          T0,C12,0
+    vlgvg          D0,C34,0
      rllg           T0,T0,1
      xgr            D0,T0
  
-    C Can use C12 as temporary
-    veslg          W0,D34,1
-    vesrlg         W1,D34,63
-    vx             D12,D12,W0
-    vx             D12,D12,W1            C Done D12
+    verllg         W0,D34,1
+    vx             D12,D12,W0            C Done D12
  
-    veslg          C12,C34,1
-    vesrlg         C34,C34,63
-    vx             D34,D34,C34
-    vx             D34,D34,C12           C Done D34
+    verllg         W1,C34,1
+    vx             D34,D34,W1            C Done D34
  
      xgr            A00,D0
      xgr            A05,D0
@@ -166,24 +174,37 @@ PROLOGUE(nettle_sha3_permute)
      vx             A1819,A1819,D34
      vx             A2324,A2324,D34
  
-    C Do the 1,2,3,4 row. First rotate, then permute.
-    vesrlg         W0,A0102,63
-    veslg          W1,A0102,62
-    vesrlg         W2,A0102,2
-    veslg          A0102,A0102,1
-    vo             W0,W0,A0102           C veslg 1  (A01)
-    vo             W2,W2,W1              C veslg 62 (A02)
-
-    veslg          A0102,A0304,28
-    vesrlg         W1,A0304,36
-    vo             A0102,A0102,W1        C veslg 28 (A03)
-    vesrlg         W1,A0304,37
-    veslg          A0304,A0304,27
-    vo             A0304,A0304,W1        C veslg 27 (A04)
+    C theta step done, no C, D or W temporaries alive.
+
+    C rho step. When doing the permutations, also
+    C transpose the rows of the matrix into temporary
+    C coordinates to assist the chi step.
+    C The pi step is deferred to the last phase.
+
+    C The combined permutation + transpose gives the following
+    C cycles (rotation counts in parentheses)
+    C   0 <- 0(0)
+    C   1 <- 3(28) <- 4(27) <- 2(62) <- 1(1)
+    C   5 <- 6(44) <- 9(20) <- 8(55) <- 5(36)
+    C   7 <- 7(6)
+    C   10 <- 12(43) <- 13(25) <- 11(10) <- 10(3)
+    C   14 <- 14(39)
+    C   15 <- 18(21) <- 17(15) <- 19(8) <- 15(41)
+    C   16 <- 16(45)
+    C   20 <- 24(14) <- 21(2) <- 22(61) <- 20(18)
+    C   23 <- 23(56)
+    
+    C Do the 1,2,3,4 row. First rotate (permute), then transpose.
+    verllg         W0,A0102,1            C verllg 1  (A01)
+    verllg         W2,A0102,62           C verllg 62 (A02)
+
+    verllg         A0102,A0304,28        C verllg 28 (A03)
+    verllg         A0304,A0304,27        C verllg 27 (A04)
  
      vmrhg          A0102,A0102,W0
      vmrlg          A0304,A0304,W2
  
+    C Do the 5,6,7,8,9 row.
      rllg           A05,A05,36
      vlvgg          W0,A05,0
      vlgvg          A05,A0607,0
@@ -194,17 +215,19 @@ PROLOGUE(nettle_sha3_permute)
      verllg         W1,A0809,55
      vmrhg          A0809,W0,W1           C Done A0809
  
-    rllg           A10,A10,42            C 42 + 25 = 3 (mod 64)
+    C Do the 10,11,12,13,14 row.
+    C A10 is rotated again by the later verllg (25): 42 + 25 = 3 (mod 64)
+    rllg           A10,A10,42
      verllg         W0,A1112,10
      vlvgg          A1112,A10,0
      vlgvg          A10,A1112,1
      rllg           A10,A10,43            C Done A10
-
      vmrhg          A1112,A1112,A1314
      verllg         A1112,A1112,25        C Done A1112
      verllg         W2,A1314,39
      vpdi           A1314,W0,W2,0b0001    C Done A1314
  
+    C Do the 15,16,17,18,19 row.
      verllg         W0,A1819,8
      rllg           A15,A15,41
      vlvgg          W1,A15,1
@@ -215,15 +238,14 @@ PROLOGUE(nettle_sha3_permute)
      vpdi           A1617,A1617,W0,0b0001 C Done A1617
      vmrlg          A1819,A1819,W1        C Done A1819
  
+    C Do the 20,21,22,23,24 row.
      rllg           A20,A20,18
      vlvgg          W0,A20,1
      vlgvg          A20,A2324,1
      rllg           A20,A20,14            C Done A20
      verllg         A2324,A2324,56
-
      verllg         W2,A2122,2
      vmrhg          A2324,A2324,W2        C Done A2324
-
      verllg         A2122,A2122,61
      vmrlg          A2122,A2122,W0        C Done A2122
  
@@ -268,13 +290,25 @@ PROLOGUE(nettle_sha3_permute)
      vx             A0304,A0304,W0
      vx             A0809,A0809,W1
  
+    C iota step.
      lg             TMP,0(COUNT,RC)
      xgr            A00,TMP
  
-    C Transpose.
+    C Deferred pi step. Transpose the matrix from the temporary
+    C positions. The transpose gives the matrix with the
+    C following (x,y) coordinates.
+    C (0,0) <- (0,0), (0,2) <- (2,0), (0,4) <- (4,0)
+    C (0,3) <- (3,0), (0,1) <- (1,0), (1,3) <- (3,1)
+    C (1,0) <- (0,1), (1,2) <- (2,1), (1,4) <- (4,1)
+    C (1,1) <- (1,1), (2,1) <- (1,2), (2,3) <- (3,2)
+    C (2,0) <- (0,2), (2,2) <- (2,2), (2,4) <- (4,2)
+    C (3,4) <- (4,3), (3,1) <- (1,3), (3,3) <- (3,3)
+    C (3,0) <- (0,3), (3,2) <- (2,3), (4,2) <- (2,4)
+    C (4,4) <- (4,4), (4,1) <- (1,4), (4,3) <- (3,4)
+    C (4,0) <- (0,4)
+
      C Swap (A05, A10) <->  A0102, and (A15, A20) <->  A0304,
      C and also copy to C12 and C34 while at it.
-
      vlvgg          C12,A05,0
      vlvgg          C34,A15,0
      vlvgg          W0,A10,0
@@ -319,6 +353,7 @@ PROLOGUE(nettle_sha3_permute)
      vx             C12,C12,A2122
      clijne         COUNT,0,.Loop
  
+    C Save state data
      stg            A00,0*8(STATE)
      vst            A0102,1*8(STATE)
      vst            A0304,3*8(STATE)
@@ -339,10 +374,11 @@ PROLOGUE(nettle_sha3_permute)
      vst            A2122,21*8(STATE)
      vst            A2324,23*8(STATE)
  
+    C Load non-volatile floating point registers
      ld             %f8,0(%r1)
      ld             %f9,8(%r1)
-    FREE_STACK(16)
-    lmg            %r6,%r14,48(SP)
+    FREE_STACK(16)                       C Deallocate stack space
+    lmg            %r6,%r14,48(SP)       C Load non-volatile general registers
  
      br             RA
  EPILOGUE(nettle_sha3_permute)
author	Mamone Tarsha <maamoun.tk@googlemail.com>
	Sun, 24 Oct 2021 18:39:11 +0000 (20:39 +0200)
committer	Mamone Tarsha <maamoun.tk@googlemail.com>
	Sun, 24 Oct 2021 18:39:11 +0000 (20:39 +0200)