ARM assembly for umac_nh_n.

author Niels Möller <nisse@lysator.liu.se>

Mon, 15 Apr 2013 12:16:59 +0000 (14:16 +0200)

committer Niels Möller <nisse@lysator.liu.se>

Mon, 15 Apr 2013 12:16:59 +0000 (14:16 +0200)
author Niels Möller <nisse@lysator.liu.se>
Mon, 15 Apr 2013 12:16:59 +0000 (14:16 +0200)
committer Niels Möller <nisse@lysator.liu.se>
Mon, 15 Apr 2013 12:16:59 +0000 (14:16 +0200)
diff --git a/ChangeLog b/ChangeLog

index 1d5a93f17fb66fd992f6c8a18df4c1456ad4e1a4..a1391ae6816e6a02358f171e6d0ec4656caa14c6 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,7 @@
  2013-04-15  Niels Möller  <nisse@lysator.liu.se>
  
+       * armv7/umac-nh-n.asm: New file. 2.0-2.3 times speedup.
+
         * testsuite/umac-test.c (test_align): Fixed memory leak.
  
  2013-04-12  Niels Möller  <nisse@lysator.liu.se>
diff --git a/armv7/umac-nh-n.asm b/armv7/umac-nh-n.asm

new file mode 100644 (file)

index 0000000..4ae876b
--- /dev/null
+++ b/armv7/umac-nh-n.asm
@@ -0,0 +1,298 @@
+C nettle, low-level cryptographics library
+C 
+C Copyright (C) 2013 Niels Möller
+C  
+C The nettle library is free software; you can redistribute it and/or modify
+C it under the terms of the GNU Lesser General Public License as published by
+C the Free Software Foundation; either version 2.1 of the License, or (at your
+C option) any later version.
+C 
+C The nettle library is distributed in the hope that it will be useful, but
+C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+C or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+C License for more details.
+C 
+C You should have received a copy of the GNU Lesser General Public License
+C along with the nettle library; see the file COPYING.LIB.  If not, write to
+C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+C MA 02111-1301, USA.
+
+       C Logical source name recorded by the assembler for this object.
+       .file "umac-nh-n.asm"
+       .fpu    neon
+
+C Core register roles. OUT, ITERS, KEY and LENGTH are the first four
+C arguments (r0-r3); MSG is loaded from the first stack word at entry.
+C   OUT     where the 64-bit results are stored
+C   ITERS   number of results wanted, selects the 2-, 3- or 4-way code path
+C   KEY     subkey pointer, advanced by the post-incrementing loads
+C   LENGTH  remaining byte count, reduced by 32 for each loop pass
+C   MSG     message pointer, rounded to an 8-byte boundary before use
+C   SHIFT   scratch for the alignment shift counts. r14 is lr, which is
+C           why the entry code saves lr on the stack.
+define(<OUT>, <r0>)
+define(<ITERS>, <r1>)
+define(<KEY>, <r2>)
+define(<LENGTH>, <r3>)
+define(<MSG>, <r12>)
+define(<SHIFT>, <r14>)
+
+C NEON register roles. q4, q5 and q6 (d8-d13) are callee-saved under the
+C ARM procedure call standard, so the paths using QY1, QC or QD push and
+C pop them. DM is d4, the low half of q2, which no Q alias below covers.
+define(<QA>, <q0>)
+define(<QB>, <q1>)
+define(<QY0>, <q3>)    C Accumulators for the first two results.
+define(<DM>, <d4>)
+define(<QY1>, <q4>)    C Accumulators for results 3 and 4 (ITERS of 3 or more).
+define(<QC>, <q5>)
+define(<QD>, <q6>)
+define(<QLEFT>, <q8>)
+define(<QRIGHT>, <q9>)
+define(<QT0>, <q10>)
+define(<QT1>, <q11>)
+define(<QT2>, <q12>)
+define(<QK0>, <q13>)
+define(<QK1>, <q14>)
+define(<QK2>, <q15>)
+
+C FIXME: Try permuting subkeys using vld4, vzip or similar.
+
+       .text
+       .align  3
+       
+C _nettle_umac_nh_n
+C
+C Arguments: OUT, ITERS, KEY, LENGTH in r0-r3, MSG in the first stack word.
+C Dispatches on ITERS to one of three loops (.Lnh2, .Lnh3, .Lnh4) which
+C store 2, 3 or 4 64-bit sums at OUT. lr is saved on the stack because r14
+C doubles as the SHIFT scratch register; every path returns by popping the
+C saved value into pc.
+PROLOGUE(_nettle_umac_nh_n)
+       ldr     MSG, [sp]
+       str     lr, [sp, #-4]!
+       
+       C Setup for 64-bit aligned reads
+       C
+       C SHIFT = MSG mod 8, and MSG is rounded down to an 8-byte boundary.
+       C DM gets the first aligned doubleword. If MSG was unaligned, MSG
+       C moves on to the next doubleword. If it was aligned, MSG stays
+       C put and SHIFT becomes 8, which makes the right shift below a full
+       C 64 bits (zeroing DM) and the left shift zero, so the words loaded
+       C in the loops pass through unchanged. The flags set by ands are
+       C what addne and addeq test.
+       ands    SHIFT, MSG, #7
+       and     MSG, MSG, #-8
+       vld1.8  {DM}, [MSG :64]
+       addne   MSG, MSG, #8
+       addeq   SHIFT, SHIFT, #8
+
+       C FIXME: Combine as rsb ?
+       C Convert the byte offset to a negative bit count; vshl with a
+       C negative register count shifts right.
+       lsl     SHIFT, SHIFT, #3
+       neg     SHIFT, SHIFT
+
+       C Right shift in QRIGHT (both halves)
+       vmov.i32 D0REG(QRIGHT)[0], SHIFT
+       vmov.32  D1REG(QRIGHT), D0REG(QRIGHT)
+       C Matching left shift count: 64 minus the right shift.
+       add     SHIFT, SHIFT, #64
+       
+       vmov.i32 D0REG(QLEFT)[0], SHIFT
+       vmov.32  D1REG(QLEFT), D0REG(QLEFT)
+       C Flags from this compare are consumed by the two branches below;
+       C the NEON instructions in between leave them alone.
+       cmp     ITERS, #3
+       vmov.i64 QY0, #0
+
+       C DM now holds the leading message bytes, moved down to bit 0, to
+       C be merged into the first doubleword built in the loop.
+       vshl.u64 DM, DM, D0REG(QRIGHT)
+       C ITERS below 3 (unsigned) goes to .Lnh2, ITERS equal to 3 goes to
+       C .Lnh3, anything larger falls through to .Lnh4.
+       bcc     .Lnh2
+       beq     .Lnh3
+       
+.Lnh4: 
+       C Code path reached by fall-through when ITERS is above 3. It
+       C produces four 64-bit sums, two lanes in QY0 and two in QY1.
+       C
+       C Permute key words, so we in each iteration have them in order
+       C
+       C P0: [0, 4,1, 5] P1: [ 2, 6, 3, 7] P2: [ 4, 8, 5, 9] P3: [ 6,10, 7,11]
+       C P4: [8,12,9,13] P5: [10,14,11,15] P6: [12,16,13,17] P7: [14,18,15,19]
+       C
+       C Also arrange the message words, so we get them as
+       C M0: [0,0,1,1] M1: [ 2, 2, 3, 3] M2: [ 4, 4, 5, 5] M3: [ 6, 6, 7, 7]
+       C M4: [8,8,9,9] M5: [10,10,11,11] M6: [12,12,13,13] M7: [14,14,15,15]
+       C
+       C Then, accumulate Y0 (first two "iters") using
+       C
+       C Y0 += (M0+P0) * (M2+P2) + (M1+P1) * (M3+P3) 
+       C Y1 += (M0+P4) * (M2+P6) + (M1+P5) * (M3+P7)
+       C
+       C Next iteration is then
+       C
+       C Y0 += (M4+P4) * (M6+P6) + (M5+P5) * (M7+P7)
+       C Y1 += (M4+P8) * (M6+P10) + (M5+P9) * (M7+P11)
+       C
+       C So we can reuse P4, P5, P6, P7 from the previous iteration.
+
+       C Register allocation: we need 4 Q regs for P0-P3, and one
+       C more for the last read key. We need at least two registers
+       C for the message (QA and QB, more if we want to expand only
+       C once). For the Y0 update, we can let the factors overwrite
+       C P0-P3, and for the Y1 update, we can overwrite M0-M3.
+       
+       C q4-q6 hold QY1, QC and QD and are callee-saved, so preserve them.
+       vpush   {q4,q5,q6}
+       vld1.32 {QK0,QK1}, [KEY]!
+       vld1.32 {QK2}, [KEY]!
+       vmov    QT0, QK1
+       vmov    QT1, QK2
+       
+       C Permute keys. QK2 is untouched, permuted subkeys put in QK0,QK1,QT0,QT1
+       vtrn.32 QK0, QK1                C Gives us [0, 4, 2, 6] and [1, 5, 3, 7]
+       vswp D1REG(QK0), D0REG(QK1)     C Gives us [0, 4, 1, 5] and [2, 6, 3, 7]
+       vtrn.32 QT0, QT1                C Gives us [4,8,6,10] and [5 ,9,7,11]
+       vswp D1REG(QT0), D0REG(QT1)     C Gives us [4,8,5, 9] and [6,10,7,11]
+
+       vmov.i64 QY1, #0
+.Loop4:
+       C Each pass consumes 32 message bytes. On entry QK0,QK1,QT0,QT1
+       C hold the permuted subkeys for the Y0 update and QK2 the last
+       C four subkeys read, not yet permuted.
+       C
+       C Set m[i] <-- m[i-1] >> RSHIFT + m[i] << LSHIFT
+       C The two shifted parts occupy disjoint bits, so veor merges them.
+       vld1.8 {QA, QB}, [MSG :64]!
+       vshl.u64 QC, QA, QRIGHT
+       vshl.u64 QD, QB, QRIGHT
+       vshl.u64 QA, QA, QLEFT
+       vshl.u64 QB, QB, QLEFT
+       veor    D0REG(QA), D0REG(QA), DM
+       veor    D1REG(QA), D1REG(QA), D0REG(QC)
+       veor    D0REG(QB), D0REG(QB), D1REG(QC)
+       veor    D1REG(QB), D1REG(QB), D0REG(QD)
+       C Carry the leftover bytes of the last doubleword to the next pass.
+       vmov    DM, D1REG(QD)
+
+       C Explode message (too bad there's no vadd with scalar)
+       C Done from the highest word down, so that every source lane is
+       C read before the register holding it is overwritten.
+       vdup.32 D1REG(QD), D1REG(QB)[1]
+       vdup.32 D0REG(QD), D1REG(QB)[0]
+       vdup.32 D1REG(QC), D0REG(QB)[1]
+       vdup.32 D0REG(QC), D0REG(QB)[0]
+       vdup.32 D1REG(QB), D1REG(QA)[1]
+       vdup.32 D0REG(QB), D1REG(QA)[0]
+       vdup.32 D1REG(QA), D0REG(QA)[1]
+       vdup.32 D0REG(QA), D0REG(QA)[0]
+
+       C Y0 factors: message plus key, overwriting the permuted keys.
+       vadd.i32 QK0, QK0, QA
+       vadd.i32 QK1, QK1, QB
+       vadd.i32 QT0, QT0, QC
+       vadd.i32 QT1, QT1, QD
+
+       vmlal.u32 QY0, D0REG(QK0), D0REG(QT0)
+       vmlal.u32 QY0, D1REG(QK0), D1REG(QT0)
+       vmlal.u32 QY0, D0REG(QK1), D0REG(QT1)
+       vmlal.u32 QY0, D1REG(QK1), D1REG(QT1)
+       
+       C Next 4 subkeys
+       C The index lists below are those of the first pass. The permuted
+       C result is used for Y1 now and stays in QK0,QK1,QT0,QT1 as the Y0
+       C keys of the next pass.
+       vld1.32 {QT0,QT1}, [KEY]!
+       vmov    QK0, QK2
+       vmov    QK1, QT0
+       vmov    QK2, QT1                C Save
+       vtrn.32 QK0, QK1                C Gives us [8,12,10,14] and [9,13,11,15]
+       vswp D1REG(QK0), D0REG(QK1)     C Gives us [8,12,9,13] and [10,14,11,15]
+       vtrn.32 QT0, QT1                C Gives us [12,16,14,18] and [13,17,15,19]
+       vswp D1REG(QT0), D0REG(QT1)     C Gives us [12,16,13,17] and [14,18,15,19]
+
+       C Y1 factors: message plus key, overwriting the exploded message.
+       vadd.i32 QA, QA, QK0
+       vadd.i32 QB, QB, QK1
+       vadd.i32 QC, QC, QT0
+       vadd.i32 QD, QD, QT1
+
+       C Flags set here are tested by bhi below. The loop always runs at
+       C least once; LENGTH is presumably a non-zero multiple of 32
+       C (confirm against the caller).
+       subs    LENGTH, LENGTH, #32
+
+       vmlal.u32 QY1, D0REG(QA), D0REG(QC)
+       vmlal.u32 QY1, D1REG(QA), D1REG(QC)
+       vmlal.u32 QY1, D0REG(QB), D0REG(QD)
+       vmlal.u32 QY1, D1REG(QB), D1REG(QD)
+
+       bhi     .Loop4
+
+       C Store the four 64-bit sums.
+       vst1.64 {QY0, QY1}, [OUT]
+       
+       vpop    {q4,q5,q6}
+       
+       C Return: pop the lr value saved at entry straight into pc.
+       ldr     pc, [sp], #+4
+
+.Lnh3:
+       C Code path for ITERS equal to 3. The first two sums are kept as
+       C the two lanes of QY0. The third is kept as two partial sums in
+       C QY1, added together after the loop.
+       C
+       C q4 holds QY1 and is callee-saved, so preserve it.
+       vpush   {q4}
+       vld1.32 {QK0,QK1}, [KEY]!
+       vmov.i64 QY1, #0
+.Loop3:
+       C Each pass consumes 32 message bytes. On entry QK0 and QK1 hold
+       C the eight subkeys lined up with this message block.
+       C
+       C Set m[i] <-- m[i-1] >> RSHIFT + m[i] << LSHIFT
+       C The two shifted parts occupy disjoint bits, so veor merges them.
+       vld1.8 {QA, QB}, [MSG :64]!
+       vshl.u64 QT0, QA, QRIGHT
+       vshl.u64 QT1, QB, QRIGHT
+       vshl.u64 QA, QA, QLEFT
+       vshl.u64 QB, QB, QLEFT
+       veor    D0REG(QA), D0REG(QA), DM
+       veor    D1REG(QA), D1REG(QA), D0REG(QT0)
+       veor    D0REG(QB), D0REG(QB), D1REG(QT0)
+       veor    D1REG(QB), D1REG(QB), D0REG(QT1)
+       C Carry the leftover bytes of the last doubleword to the next pass.
+       vmov    DM, D1REG(QT1)
+       
+       vld1.32 {QK2}, [KEY]!
+       C Construct factors, with low half corresponding to first iteration,
+       C and high half corresponding to the second iteration.
+       C The index lists in the comments below are those of the first pass.
+       vmov    QT0, QK1
+       vtrn.32 QK0, QT0                C Gives us [0, 4, 2, 6] and [1, 5, 3, 7]
+       vswp D1REG(QK0), D0REG(QT0)     C Gives us [0, 4, 1, 5] and [2, 6, 3, 7]
+       vdup.32 D0REG(QT1), D0REG(QA)[0]
+       vdup.32 D1REG(QT1), D0REG(QA)[1]
+       vadd.i32        QT1, QT1, QK0
+
+       vmov    QK0, QK2                C Save for next iteration
+       vtrn.32 QK1, QK2                C Gives us [4, 8, 6, 10] and [5,  9, 7, 11]
+       vswp    D1REG(QK1), D0REG(QK2)  C Gives us [4, 8, 5,  9] and [6, 10, 7, 11]
+       
+       vdup.32 D0REG(QT2), D0REG(QB)[0]
+       vdup.32 D1REG(QT2), D0REG(QB)[1]
+       vadd.i32 QK1, QK1, QT2
+       vmlal.u32 QY0, D0REG(QT1), D0REG(QK1)
+       vmlal.u32 QY0, D1REG(QT1), D1REG(QK1)
+
+       vdup.32 D0REG(QT1), D1REG(QA)[0]
+       vdup.32 D1REG(QT1), D1REG(QA)[1]
+       vadd.i32        QT0, QT0, QT1
+       vdup.32 D0REG(QT1), D1REG(QB)[0]
+       vdup.32 D1REG(QT1), D1REG(QB)[1]
+       vadd.i32        QK2, QK2, QT1
+
+       vmlal.u32 QY0, D0REG(QT0), D0REG(QK2)
+       vmlal.u32 QY0, D1REG(QT0), D1REG(QK2)
+
+       C Third sum. QA and QB still hold the plain message words, since
+       C the duplicated copies above went to QT1 and QT2. They are added
+       C to the subkeys saved in QK0 and the four subkeys loaded here.
+       C After this, QK0 and QK1 are lined up for the next pass.
+       vld1.32 {QK1}, [KEY]!
+       vadd.i32 QA, QA, QK0
+       vadd.i32 QB, QB, QK1
+       C Flags set here are tested by bhi below. The loop always runs at
+       C least once; LENGTH is presumably a non-zero multiple of 32
+       C (confirm against the caller).
+       subs    LENGTH, LENGTH, #32
+       vmlal.u32 QY1, D0REG(QA), D0REG(QB)
+       vmlal.u32 QY1, D1REG(QA), D1REG(QB)
+       bhi     .Loop3
+
+       C Fold the two partial sums of the third result, then store three
+       C 64-bit values.
+       vadd.i64 D0REG(QY1), D0REG(QY1), D1REG(QY1)
+       vst1.64 {D0REG(QY0), D1REG(QY0), D0REG(QY1)}, [OUT]
+       
+       vpop    {q4}
+       
+       C Return: pop the lr value saved at entry straight into pc.
+       ldr     pc, [sp], #+4
+       
+.Lnh2:
+       C Code path taken when ITERS is below 3 (unsigned compare). It
+       C always produces two 64-bit sums, the two lanes of QY0. Only
+       C q0, q1, q3, d4 and q8-q15 are used here, none of them
+       C callee-saved, so nothing is pushed.
+       vld1.32 {QK0}, [KEY]!
+.Loop2:
+       C Each pass consumes 32 message bytes. On entry QK0 holds the
+       C first four subkeys lined up with this message block.
+       C
+       C Set m[i] <-- m[i-1] >> RSHIFT + m[i] << LSHIFT
+       C The two shifted parts occupy disjoint bits, so veor merges them.
+       vld1.8 {QA, QB}, [MSG :64]!
+       vshl.u64 QT0, QA, QRIGHT
+       vshl.u64 QT1, QB, QRIGHT
+       vshl.u64 QA, QA, QLEFT
+       vshl.u64 QB, QB, QLEFT
+       veor    D0REG(QA), D0REG(QA), DM
+       veor    D1REG(QA), D1REG(QA), D0REG(QT0)
+       veor    D0REG(QB), D0REG(QB), D1REG(QT0)
+       veor    D1REG(QB), D1REG(QB), D0REG(QT1)
+       C Carry the leftover bytes of the last doubleword to the next pass.
+       vmov    DM, D1REG(QT1)
+       
+       vld1.32 {QK1,QK2}, [KEY]!
+       C Construct factors, with low half corresponding to first iteration,
+       C and high half corresponding to the second iteration.
+       C The index lists in the comments below are those of the first pass.
+       vmov    QT0, QK1
+       vtrn.32 QK0, QT0                C Gives us [0, 4, 2, 6] and [1, 5, 3, 7]
+       vswp D1REG(QK0), D0REG(QT0)     C Gives us [0, 4, 1, 5] and [2, 6, 3, 7]
+       vdup.32 D0REG(QT1), D0REG(QA)[0]
+       vdup.32 D1REG(QT1), D0REG(QA)[1]
+       vadd.i32        QT1, QT1, QK0
+
+       vmov    QK0, QK2                C Save for next iteration
+       vtrn.32 QK1, QK2                C Gives us [4, 8, 6, 10] and [5,  9, 7, 11]
+       vswp    D1REG(QK1), D0REG(QK2)  C Gives us [4, 8, 5,  9] and [6, 10, 7, 11]
+       
+       vdup.32 D0REG(QT2), D0REG(QB)[0]
+       vdup.32 D1REG(QT2), D0REG(QB)[1]
+       vadd.i32 QK1, QK1, QT2
+       vmlal.u32 QY0, D0REG(QT1), D0REG(QK1)
+       vmlal.u32 QY0, D1REG(QT1), D1REG(QK1)
+
+       vdup.32 D0REG(QT1), D1REG(QA)[0]
+       vdup.32 D1REG(QT1), D1REG(QA)[1]
+       vadd.i32        QT0, QT0, QT1
+       vdup.32 D0REG(QT1), D1REG(QB)[0]
+       vdup.32 D1REG(QT1), D1REG(QB)[1]
+       vadd.i32        QK2, QK2, QT1
+
+       C Flags set here are tested by bhi below. The loop always runs at
+       C least once; LENGTH is presumably a non-zero multiple of 32
+       C (confirm against the caller).
+       subs    LENGTH, LENGTH, #32
+       
+       vmlal.u32 QY0, D0REG(QT0), D0REG(QK2)
+       vmlal.u32 QY0, D1REG(QT0), D1REG(QK2)
+       
+       bhi     .Loop2
+       C Store the two 64-bit sums.
+       vst1.64 {QY0}, [OUT]
+
+       C NOTE(review): no branch to .Lend is visible in this file; it is
+       C only reached by fall-through.
+.Lend:
+       C Return: pop the lr value saved at entry straight into pc.
+       ldr     pc, [sp], #+4
+EPILOGUE(_nettle_umac_nh_n)
author	Niels Möller <nisse@lysator.liu.se>
	Mon, 15 Apr 2013 12:16:59 +0000 (14:16 +0200)
committer	Niels Möller <nisse@lysator.liu.se>
	Mon, 15 Apr 2013 12:16:59 +0000 (14:16 +0200)
ChangeLog		patch \| blob \| blame \| history
armv7/umac-nh-n.asm	[new file with mode: 0644]	patch \| blob