From: Niels Möller <nisse@lysator.liu.se>
Date: Mon, 15 Apr 2013 12:16:59 +0000 (+0200)
Subject: ARM assembly for umac_nh_n.
X-Git-Tag: nettle_2.7_release_20130424~52
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=38d507fa1327feb3182a2b5f8de7d9583f9566e1;p=thirdparty%2Fnettle.git

ARM assembly for umac_nh_n.
---

diff --git a/ChangeLog b/ChangeLog
index 1d5a93f1..a1391ae6 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,7 @@
 2013-04-15  Niels Möller  <nisse@lysator.liu.se>
 
+	* armv7/umac-nh-n.asm: New file. 2.0-2.3 time speedup.
+
 	* testsuite/umac-test.c (test_align): Fixed memory leak.
 
 2013-04-12  Niels Möller  <nisse@lysator.liu.se>
diff --git a/armv7/umac-nh-n.asm b/armv7/umac-nh-n.asm
new file mode 100644
index 00000000..4ae876b5
--- /dev/null
+++ b/armv7/umac-nh-n.asm
@@ -0,0 +1,298 @@
+C nettle, low-level cryptographics library
+C
+C Copyright (C) 2013 Niels Möller
+C
+C The nettle library is free software; you can redistribute it and/or modify
+C it under the terms of the GNU Lesser General Public License as published by
+C the Free Software Foundation; either version 2.1 of the License, or (at your
+C option) any later version.
+C
+C The nettle library is distributed in the hope that it will be useful, but
+C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+C License for more details.
+C
+C You should have received a copy of the GNU Lesser General Public License
+C along with the nettle library; see the file COPYING.LIB. If not, write to
+C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+C MA 02111-1301, USA.
+
+	.file "umac-nh-n.asm"
+	.fpu	neon
+
+define(<OUT>, <r0>)
+define(<ITERS>, <r1>)
+define(<KEY>, <r2>)
+define(<LENGTH>, <r3>)
+define(<MSG>, <r12>)
+define(<SHIFT>, <r14>)
+
+define(<QA>, <q0>)
+define(<QB>, <q1>)
+define(<QY0>, <q3>)	C Accumulates for the first two operations.
+define(<DM>, <d16>)
+define(<QY1>, <q4>)	C Used for 3 and 4 iterations.
+define(<QC>, <q5>)
+define(<QD>, <q6>)
+define(<QLEFT>, <q2>)
+define(<QRIGHT>, <q9>)
+define(<QT0>, <q10>)
+define(<QT1>, <q11>)
+define(<QT2>, <q12>)
+define(<QK0>, <q13>)
+define(<QK1>, <q14>)
+define(<QK2>, <q15>)
+
+C FIXME: Try permuting subkeys using vld4, vzip or similar.
+
+	.text
+	.align	3
+
+PROLOGUE(_nettle_umac_nh_n)
+	ldr	MSG, [sp]
+	str	lr, [sp, #-4]!
+
+	C Setup for 64-bit aligned reads
+	ands	SHIFT, MSG, #7
+	and	MSG, MSG, #-8
+	vld1.8	{DM}, [MSG :64]
+	addne	MSG, MSG, #8
+	addeq	SHIFT, SHIFT, #8
+
+	C FIXME: Combine as rsb?
+	lsl	SHIFT, SHIFT, #3
+	neg	SHIFT, SHIFT
+
+	C Right shift in QRIGHT (both halves)
+	vmov.i32	D0REG(QRIGHT)[0], SHIFT
+	vmov.32	D1REG(QRIGHT), D0REG(QRIGHT)
+	add	SHIFT, SHIFT, #64
+
+	vmov.i32	D0REG(QLEFT)[0], SHIFT
+	vmov.32	D1REG(QLEFT), D0REG(QLEFT)
+	cmp	r1, #3
+	vmov.i64	QY0, #0
+
+	vshl.u64	DM, DM, D0REG(QRIGHT)
+	bcc	.Lnh2
+	beq	.Lnh3
+
+.Lnh4:
+	C Permute key words, so that in each iteration we have them in order
+	C
+	C P0: [0, 4, 1, 5] P1: [ 2, 6, 3, 7] P2: [ 4, 8, 5, 9] P3: [ 6,10, 7,11]
+	C P4: [8,12, 9,13] P5: [10,14,11,15] P6: [12,16,13,17] P7: [14,18,15,19]
+	C
+	C Also arrange the message words, so we get them as
+	C M0: [0,0,1,1] M1: [ 2, 2, 3, 3] M2: [ 4, 4, 5, 5] M3: [ 6, 6, 7, 7]
+	C M4: [8,8,9,9] M5: [10,10,11,11] M6: [12,12,13,13] M7: [14,14,15,15]
+	C
+	C Then, accumulate Y0 (first two "iters") using
+	C
+	C   Y0 += (M0+P0) * (M2+P2) + (M1+P1) * (M3+P3)
+	C   Y1 += (M0+P4) * (M2+P6) + (M1+P5) * (M3+P7)
+	C
+	C Next iteration is then
+	C
+	C   Y0 += (M4+P4) * (M6+P6)  + (M5+P5) * (M7+P7)
+	C   Y1 += (M4+P8) * (M6+P10) + (M5+P9) * (M7+P11)
+	C
+	C So we can reuse P4, P5, P6, P7 from the previous iteration.
+
+	C How to fit this in registers? We need 4 Q regs for P0-P3, and one
+	C more for the last read key. We need at least two registers
+	C for the message (QA and QB, more if we want to expand only
+	C once). For the Y0 update, we can let the factors overwrite
+	C P0-P3, and for the Y1 update, we can overwrite M0-M3.
+
+	vpush	{q4,q5,q6}
+	vld1.32	{QK0,QK1}, [KEY]!
+	vld1.32	{QK2}, [KEY]!
+	vmov	QT0, QK1
+	vmov	QT1, QK2
+
+	C Permute keys. QK2 is untouched, permuted subkeys put in QK0,QK1,QT0,QT1
+	vtrn.32	QK0, QK1		C Gives us [0, 4, 2, 6] and [1, 5, 3, 7]
+	vswp	D1REG(QK0), D0REG(QK1)	C Gives us [0, 4, 1, 5] and [2, 6, 3, 7]
+	vtrn.32	QT0, QT1		C Gives us [4, 8, 6, 10] and [5, 9, 7, 11]
+	vswp	D1REG(QT0), D0REG(QT1)	C Gives us [4, 8, 5, 9] and [6, 10, 7, 11]
+
+	vmov.i64	QY1, #0
+.Loop4:
+	C Set m[i] <-- m[i-1] >> RSHIFT + m[i] << LSHIFT
+	vld1.8	{QA, QB}, [MSG :64]!
+	vshl.u64	QC, QA, QRIGHT
+	vshl.u64	QD, QB, QRIGHT
+	vshl.u64	QA, QA, QLEFT
+	vshl.u64	QB, QB, QLEFT
+	veor	D0REG(QA), D0REG(QA), DM
+	veor	D1REG(QA), D1REG(QA), D0REG(QC)
+	veor	D0REG(QB), D0REG(QB), D1REG(QC)
+	veor	D1REG(QB), D1REG(QB), D0REG(QD)
+	vmov	DM, D1REG(QD)
+
+	C Explode message (too bad there's no vadd with scalar)
+	vdup.32	D1REG(QD), D1REG(QB)[1]
+	vdup.32	D0REG(QD), D1REG(QB)[0]
+	vdup.32	D1REG(QC), D0REG(QB)[1]
+	vdup.32	D0REG(QC), D0REG(QB)[0]
+	vdup.32	D1REG(QB), D1REG(QA)[1]
+	vdup.32	D0REG(QB), D1REG(QA)[0]
+	vdup.32	D1REG(QA), D0REG(QA)[1]
+	vdup.32	D0REG(QA), D0REG(QA)[0]
+
+	vadd.i32	QK0, QK0, QA
+	vadd.i32	QK1, QK1, QB
+	vadd.i32	QT0, QT0, QC
+	vadd.i32	QT1, QT1, QD
+
+	vmlal.u32	QY0, D0REG(QK0), D0REG(QT0)
+	vmlal.u32	QY0, D1REG(QK0), D1REG(QT0)
+	vmlal.u32	QY0, D0REG(QK1), D0REG(QT1)
+	vmlal.u32	QY0, D1REG(QK1), D1REG(QT1)
+
+	C Next 4 subkeys
+	vld1.32	{QT0,QT1}, [KEY]!
+	vmov	QK0, QK2
+	vmov	QK1, QT0
+	vmov	QK2, QT1		C Save
+	vtrn.32	QK0, QK1		C Gives us [8,12,10,14] and [9,13,11,15]
+	vswp	D1REG(QK0), D0REG(QK1)	C Gives us [8,12,9,13] and [10,14,11,15]
+	vtrn.32	QT0, QT1		C Gives us [12,16,14,18] and [13,17,15,19]
+	vswp	D1REG(QT0), D0REG(QT1)	C Gives us [12,16,13,17] and [14,18,15,19]
+
+	vadd.i32	QA, QA, QK0
+	vadd.i32	QB, QB, QK1
+	vadd.i32	QC, QC, QT0
+	vadd.i32	QD, QD, QT1
+
+	subs	LENGTH, LENGTH, #32
+
+	vmlal.u32	QY1, D0REG(QA), D0REG(QC)
+	vmlal.u32	QY1, D1REG(QA), D1REG(QC)
+	vmlal.u32	QY1, D0REG(QB), D0REG(QD)
+	vmlal.u32	QY1, D1REG(QB), D1REG(QD)
+
+	bhi	.Loop4
+
+	vst1.64	{QY0, QY1}, [OUT]
+
+	vpop	{q4,q5,q6}
+
+	ldr	pc, [sp], #+4
+
+.Lnh3:
+	vpush	{q4}
+	vld1.32	{QK0,QK1}, [KEY]!
+	vmov.i64	QY1, #0
+.Loop3:
+	C Set m[i] <-- m[i-1] >> RSHIFT + m[i] << LSHIFT
+	vld1.8	{QA, QB}, [MSG :64]!
+	vshl.u64	QT0, QA, QRIGHT
+	vshl.u64	QT1, QB, QRIGHT
+	vshl.u64	QA, QA, QLEFT
+	vshl.u64	QB, QB, QLEFT
+	veor	D0REG(QA), D0REG(QA), DM
+	veor	D1REG(QA), D1REG(QA), D0REG(QT0)
+	veor	D0REG(QB), D0REG(QB), D1REG(QT0)
+	veor	D1REG(QB), D1REG(QB), D0REG(QT1)
+	vmov	DM, D1REG(QT1)
+
+	vld1.32	{QK2}, [KEY]!
+	C Construct factors, with low half corresponding to first iteration,
+	C and high half corresponding to the second iteration.
+	vmov	QT0, QK1
+	vtrn.32	QK0, QT0		C Gives us [0, 4, 2, 6] and [1, 5, 3, 7]
+	vswp	D1REG(QK0), D0REG(QT0)	C Gives us [0, 4, 1, 5] and [2, 6, 3, 7]
+	vdup.32	D0REG(QT1), D0REG(QA)[0]
+	vdup.32	D1REG(QT1), D0REG(QA)[1]
+	vadd.i32	QT1, QT1, QK0
+
+	vmov	QK0, QK2		C Save for next iteration
+	vtrn.32	QK1, QK2		C Gives us [4, 8, 6, 10] and [5, 9, 7, 11]
+	vswp	D1REG(QK1), D0REG(QK2)	C Gives us [4, 8, 5, 9] and [6, 10, 7, 11]
+
+	vdup.32	D0REG(QT2), D0REG(QB)[0]
+	vdup.32	D1REG(QT2), D0REG(QB)[1]
+	vadd.i32	QK1, QK1, QT2
+	vmlal.u32	QY0, D0REG(QT1), D0REG(QK1)
+	vmlal.u32	QY0, D1REG(QT1), D1REG(QK1)
+
+	vdup.32	D0REG(QT1), D1REG(QA)[0]
+	vdup.32	D1REG(QT1), D1REG(QA)[1]
+	vadd.i32	QT0, QT0, QT1
+	vdup.32	D0REG(QT1), D1REG(QB)[0]
+	vdup.32	D1REG(QT1), D1REG(QB)[1]
+	vadd.i32	QK2, QK2, QT1
+
+	vmlal.u32	QY0, D0REG(QT0), D0REG(QK2)
+	vmlal.u32	QY0, D1REG(QT0), D1REG(QK2)
+
+	vld1.32	{QK1}, [KEY]!
+	vadd.i32	QA, QA, QK0
+	vadd.i32	QB, QB, QK1
+	subs	LENGTH, LENGTH, #32
+	vmlal.u32	QY1, D0REG(QA), D0REG(QB)
+	vmlal.u32	QY1, D1REG(QA), D1REG(QB)
+	bhi	.Loop3
+
+	vadd.i64	D0REG(QY1), D0REG(QY1), D1REG(QY1)
+	vst1.64	{D0REG(QY0), D1REG(QY0), D0REG(QY1)}, [OUT]
+
+	vpop	{q4}
+
+	ldr	pc, [sp], #+4
+
+.Lnh2:
+	vld1.32	{QK0}, [KEY]!
+.Loop2:
+	C Set m[i] <-- m[i-1] >> RSHIFT + m[i] << LSHIFT
+	vld1.8	{QA, QB}, [MSG :64]!
+	vshl.u64	QT0, QA, QRIGHT
+	vshl.u64	QT1, QB, QRIGHT
+	vshl.u64	QA, QA, QLEFT
+	vshl.u64	QB, QB, QLEFT
+	veor	D0REG(QA), D0REG(QA), DM
+	veor	D1REG(QA), D1REG(QA), D0REG(QT0)
+	veor	D0REG(QB), D0REG(QB), D1REG(QT0)
+	veor	D1REG(QB), D1REG(QB), D0REG(QT1)
+	vmov	DM, D1REG(QT1)
+
+	vld1.32	{QK1,QK2}, [KEY]!
+	C Construct factors, with low half corresponding to first iteration,
+	C and high half corresponding to the second iteration.
+	vmov	QT0, QK1
+	vtrn.32	QK0, QT0		C Gives us [0, 4, 2, 6] and [1, 5, 3, 7]
+	vswp	D1REG(QK0), D0REG(QT0)	C Gives us [0, 4, 1, 5] and [2, 6, 3, 7]
+	vdup.32	D0REG(QT1), D0REG(QA)[0]
+	vdup.32	D1REG(QT1), D0REG(QA)[1]
+	vadd.i32	QT1, QT1, QK0
+
+	vmov	QK0, QK2		C Save for next iteration
+	vtrn.32	QK1, QK2		C Gives us [4, 8, 6, 10] and [5, 9, 7, 11]
+	vswp	D1REG(QK1), D0REG(QK2)	C Gives us [4, 8, 5, 9] and [6, 10, 7, 11]
+
+	vdup.32	D0REG(QT2), D0REG(QB)[0]
+	vdup.32	D1REG(QT2), D0REG(QB)[1]
+	vadd.i32	QK1, QK1, QT2
+	vmlal.u32	QY0, D0REG(QT1), D0REG(QK1)
+	vmlal.u32	QY0, D1REG(QT1), D1REG(QK1)
+
+	vdup.32	D0REG(QT1), D1REG(QA)[0]
+	vdup.32	D1REG(QT1), D1REG(QA)[1]
+	vadd.i32	QT0, QT0, QT1
+	vdup.32	D0REG(QT1), D1REG(QB)[0]
+	vdup.32	D1REG(QT1), D1REG(QB)[1]
+	vadd.i32	QK2, QK2, QT1
+
+	subs	LENGTH, LENGTH, #32
+
+	vmlal.u32	QY0, D0REG(QT0), D0REG(QK2)
+	vmlal.u32	QY0, D1REG(QT0), D1REG(QK2)
+
+	bhi	.Loop2
+	vst1.64	{QY0}, [OUT]
+
+.Lend:
+	ldr	pc, [sp], #+4
+EPILOGUE(_nettle_umac_nh_n)
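
For readers who want the computation in one place: below is a rough scalar C sketch of what _nettle_umac_nh_n computes, written from the comments and register assignments in the patch above. It is an illustration, not code from nettle; the helper names (umac_nh_n_ref, le32) are made up, and the prototype is inferred from the OUT, ITERS, KEY, LENGTH, MSG arguments in the prologue. Hash number i uses the same key stream shifted by 4*i words, which is what makes the interleaved evaluation profitable. The sketch also sidesteps the alignment trick: the assembly rounds MSG down to a 64-bit boundary and recombines neighboring dwords with the QLEFT/QRIGHT shifts and the DM carry register, while the C version simply reads bytes.

	#include <stdint.h>

	/* Hypothetical scalar reference for _nettle_umac_nh_n.  Assumes
	   length is positive and a multiple of 32, message words are
	   little-endian, and length/4 + 4*(n-1) key words are available. */
	static uint32_t
	le32 (const uint8_t *p)
	{
	  return (uint32_t) p[0] | ((uint32_t) p[1] << 8)
	    | ((uint32_t) p[2] << 16) | ((uint32_t) p[3] << 24);
	}

	static void
	umac_nh_n_ref (uint64_t *out, unsigned n, const uint32_t *key,
		       unsigned length, const uint8_t *msg)
	{
	  unsigned i, j;
	  for (i = 0; i < n; i++)
	    out[i] = 0;

	  /* Each 32-byte block holds 8 words m[0..7]; hash i accumulates
	     (m[j] + k[4i+j]) * (m[j+4] + k[4i+j+4]) for j = 0..3, with the
	     additions wrapping mod 2^32 and the products summed mod 2^64. */
	  for (; length >= 32; length -= 32, msg += 32, key += 8)
	    for (i = 0; i < n; i++)
	      for (j = 0; j < 4; j++)
		out[i] += (uint64_t) (uint32_t) (le32 (msg + 4*j) + key[4*i + j])
		  * (uint32_t) (le32 (msg + 16 + 4*j) + key[4*i + 4 + j]);
	}

This also explains the Y0/Y1 split in the assembly: each vmlal lane accumulates one value of i, so QY0's two 64-bit lanes hold hashes 0 and 1, QY1's lanes hold hashes 2 and 3 in the 4-iteration path, and the 3-iteration path spreads hash 2's products over both lanes of QY1 and folds them with the final vadd.i64.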