[Arm64] Optimize Chacha20
author     Mamone Tarsha <maamoun.tk@googlemail.com>
           Tue, 18 Jan 2022 17:27:32 +0000 (19:27 +0200)
committer  Mamone Tarsha <maamoun.tk@googlemail.com>
           Tue, 18 Jan 2022 17:27:32 +0000 (19:27 +0200)
Makefile.in
arm64/asimd/chacha-2core.asm [new file with mode: 0644]
arm64/asimd/chacha-4core.asm [new file with mode: 0644]
arm64/asimd/chacha-core-internal.asm [new file with mode: 0644]
arm64/fat/chacha-2core.asm [new file with mode: 0644]
arm64/fat/chacha-4core.asm [new file with mode: 0644]
arm64/fat/chacha-core-internal-2.asm [new file with mode: 0644]
configure.ac
fat-arm64.c

index 0590c370b26f1b340452c6458b7f1f166107ee10..7c87ca652e0aa5be9285c1ea37687a72b51f94eb 100644 (file)
@@ -606,7 +606,7 @@ distdir: $(DISTFILES)
        set -e; for d in sparc32 sparc64 x86 \
                x86_64 x86_64/aesni x86_64/sha_ni x86_64/fat \
                arm arm/neon arm/v6 arm/fat \
-               arm64 arm64/crypto arm64/fat \
+               arm64 arm64/asimd arm64/crypto arm64/fat \
                powerpc64 powerpc64/p7 powerpc64/p8 powerpc64/fat \
                s390x s390x/vf s390x/msa s390x/msa_x1 s390x/msa_x2 s390x/msa_x4 s390x/fat ; do \
          mkdir "$(distdir)/$$d" ; \
diff --git a/arm64/asimd/chacha-2core.asm b/arm64/asimd/chacha-2core.asm
new file mode 100644 (file)
index 0000000..792d2c4
--- /dev/null
@@ -0,0 +1,231 @@
+C arm64/asimd/chacha-2core.asm
+
+ifelse(`
+   Copyright (C) 2020 Niels Möller and Torbjörn Granlund
+   Copyright (C) 2022 Mamone Tarsha
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+
+C Register usage:
+
+C Arguments
+define(`DST', `x0')
+define(`SRC', `x1')
+define(`ROUNDS', `x2')
+
+C Working state
+
+define(`ROT24', `v0')
+
+define(`T0', `v16')
+
+C State, even elements in X, odd elements in Y
+define(`X0', `v17')
+define(`X1', `v18')
+define(`X2', `v19')
+define(`X3', `v20')
+define(`Y0', `v21')
+define(`Y1', `v22')
+define(`Y2', `v23')
+define(`Y3', `v24')
+
+C Original input state
+define(`S0', `v25')
+define(`S1', `v26')
+define(`S2', `v27')
+define(`S3', `v28')
+define(`S3p1', `v29')
+
+define(`TMP0', `v30')
+define(`TMP1', `v31')
+
+       C _chacha_2core(uint32_t *dst, const uint32_t *src, unsigned rounds)
+PROLOGUE(_nettle_chacha_2core)
+
+       eor             X1.16b, X1.16b, X1.16b
+       mov             w3, #1
+       mov             X1.s[0], w3
+
+       add             x3, SRC, #48
+       ld1             {X3.4s}, [x3]
+
+       add     Y3.4s, X3.4s, X1.4s
+       cmhi    Y3.4s, X3.4s, Y3.4s
+       ext             Y3.16b, Y3.16b, Y3.16b, #12
+       orr             Y3.16b, Y3.16b, X1.16b
+
+.Lshared_entry:
+       adr             x3, .Lrot24
+       ld1             {ROT24.4s},[x3]
+
+       add     Y3.4s, Y3.4s, X3.4s
+
+C Load state
+       ld1             {X0.4s,X1.4s,X2.4s}, [SRC]
+
+       mov             S0.16b, X0.16b
+       mov             S1.16b, X1.16b
+       mov             S2.16b, X2.16b
+       mov             S3.16b, X3.16b
+       mov             S3p1.16b, Y3.16b
+
+       trn2    Y0.4s, X0.4s, X0.4s     C  1  1  3  3
+       trn1    X0.4s, X0.4s, X0.4s     C  0  0  2  2
+       trn2    Y1.4s, X1.4s, X1.4s     C  5  5  7  7
+       trn1    X1.4s, X1.4s, X1.4s     C  4  4  6  6
+       trn2    Y2.4s, X2.4s, X2.4s     C  9  9 11 11
+       trn1    X2.4s, X2.4s, X2.4s     C  8  8 10 10
+       trn2    Y3.4s, X3.4s, S3p1.4s   C  13 13 15 15
+       trn1    X3.4s, X3.4s, S3p1.4s   C  12 12 14 14
+
+.Loop:
+C Register layout (A is first block, B is second block)
+C
+C X0:  A0  B0  A2  B2  Y0:  A1  B1  A3  B3
+C X1:  A4  B4  A6  B6  Y1:  A5  B5  A7  B7
+C X2:  A8  B8 A10 B10  Y2:  A9  B9 A11 B11
+C X3: A12 B12 A14 B14  Y3: A13 B13 A15 B15
+       add             X0.4s, X0.4s, X1.4s
+        add    Y0.4s, Y0.4s, Y1.4s
+       eor             X3.16b, X3.16b, X0.16b
+        eor    Y3.16b, Y3.16b, Y0.16b
+       rev32   X3.8h, X3.8h
+        rev32  Y3.8h, Y3.8h
+       
+       add             X2.4s, X2.4s, X3.4s
+        add    Y2.4s, Y2.4s, Y3.4s
+       eor             TMP0.16b, X1.16b, X2.16b
+        eor    TMP1.16b, Y1.16b, Y2.16b
+       ushr    X1.4s, TMP0.4s, #20
+        ushr   Y1.4s, TMP1.4s, #20
+       sli             X1.4s, TMP0.4s, #12
+        sli    Y1.4s, TMP1.4s, #12
+       
+       add             X0.4s, X0.4s, X1.4s
+        add    Y0.4s, Y0.4s, Y1.4s
+       eor             X3.16b, X3.16b, X0.16b
+        eor    Y3.16b, Y3.16b, Y0.16b
+       tbl             X3.16b, {X3.16b}, ROT24.16b
+        tbl    Y3.16b, {Y3.16b}, ROT24.16b
+       
+       add             X2.4s, X2.4s, X3.4s
+        add    Y2.4s, Y2.4s, Y3.4s
+       eor             TMP0.16b, X1.16b, X2.16b
+        eor    TMP1.16b, Y1.16b, Y2.16b
+       ushr    X1.4s, TMP0.4s, #25
+        ushr   Y1.4s, TMP1.4s, #25
+       sli             X1.4s, TMP0.4s, #7
+        sli    Y1.4s, TMP1.4s, #7
+
+       ext             X1.16b, X1.16b, X1.16b, #8
+       ext             X2.16b, X2.16b, X2.16b, #8
+       ext             Y2.16b, Y2.16b, Y2.16b, #8
+       ext             Y3.16b, Y3.16b, Y3.16b, #8
+
+C Register layout:
+C X0:  A0  B0  A2  B2  Y0:  A1  B1  A3  B3
+C Y1:  A5  B5  A7  B7  X1:  A6  B6  A4  B4 (X1 swapped)
+C X2: A10 B10  A8  B8  Y2: A11 B11  A9  B9 (X2, Y2 swapped)
+C Y3: A15 B15 A13 B13  X3: A12 B12 A14 B14 (Y3 swapped)
+
+       add             X0.4s, X0.4s, Y1.4s
+        add    Y0.4s, Y0.4s, X1.4s
+       eor             Y3.16b, Y3.16b, X0.16b
+        eor    X3.16b, X3.16b, Y0.16b
+       rev32   Y3.8h, Y3.8h
+        rev32  X3.8h, X3.8h
+       
+       add             X2.4s, X2.4s, Y3.4s
+        add    Y2.4s, Y2.4s, X3.4s
+       eor             TMP0.16b, Y1.16b, X2.16b
+        eor    TMP1.16b, X1.16b, Y2.16b
+       ushr    Y1.4s, TMP0.4s, #20
+        ushr   X1.4s, TMP1.4s, #20
+       sli             Y1.4s, TMP0.4s, #12
+        sli    X1.4s, TMP1.4s, #12
+       
+       add             X0.4s, X0.4s, Y1.4s
+        add    Y0.4s, Y0.4s, X1.4s
+       eor             Y3.16b, Y3.16b, X0.16b
+        eor    X3.16b, X3.16b, Y0.16b
+       tbl             Y3.16b, {Y3.16b}, ROT24.16b
+        tbl    X3.16b, {X3.16b}, ROT24.16b
+       
+       add             X2.4s, X2.4s, Y3.4s
+        add    Y2.4s, Y2.4s, X3.4s
+       eor             TMP0.16b, Y1.16b, X2.16b
+        eor    TMP1.16b, X1.16b, Y2.16b
+       ushr    Y1.4s, TMP0.4s, #25
+        ushr   X1.4s, TMP1.4s, #25
+       sli             Y1.4s, TMP0.4s, #7
+        sli    X1.4s, TMP1.4s, #7
+
+       ext             X1.16b, X1.16b, X1.16b, #8
+       ext             X2.16b, X2.16b, X2.16b, #8
+       ext             Y2.16b, Y2.16b, Y2.16b, #8
+       ext             Y3.16b, Y3.16b, Y3.16b, #8
+
+       subs    ROUNDS, ROUNDS, #2
+       b.ne    .Loop
+
+       trn1    T0.4s, X0.4s, Y0.4s
+       trn2    Y0.4s, X0.4s, Y0.4s
+
+       trn1    X0.4s, X1.4s, Y1.4s
+       trn2    Y1.4s, X1.4s, Y1.4s
+
+       trn1    X1.4s, X2.4s, Y2.4s
+       trn2    Y2.4s, X2.4s, Y2.4s
+
+       trn1    X2.4s, X3.4s, Y3.4s
+       trn2    Y3.4s, X3.4s, Y3.4s
+
+       add             T0.4s, T0.4s, S0.4s
+       add             Y0.4s, Y0.4s, S0.4s
+       add             X0.4s, X0.4s, S1.4s
+       add             Y1.4s, Y1.4s, S1.4s
+       add             X1.4s, X1.4s, S2.4s
+       add             Y2.4s, Y2.4s, S2.4s
+       add             X2.4s, X2.4s, S3.4s
+       add             Y3.4s, Y3.4s, S3p1.4s
+
+       st1             {T0.16b,X0.16b,X1.16b,X2.16b}, [DST], #64
+       st1             {Y0.16b,Y1.16b,Y2.16b,Y3.16b}, [DST]
+       ret
+EPILOGUE(_nettle_chacha_2core)
+
+PROLOGUE(_nettle_chacha_2core32)
+       eor             Y3.16b, Y3.16b, Y3.16b  C {0,0,...,0}
+       mov             w3, #1
+       mov             Y3.s[0], w3     C {1,0,...,0}
+       add             x3, SRC, #48
+       ld1             {X3.4s}, [x3]
+       b               .Lshared_entry
+EPILOGUE(_nettle_chacha_2core32)
+
+.align 4
+.Lrot24: .long 0x02010003,0x06050407,0x0a09080b,0x0e0d0c0f
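
Note (editor's aside, not part of the commit): ChaCha needs 32-bit left rotations by 16, 12, 8 and 7. The code above implements <<<16 with rev32 on .8h elements (swapping the 16-bit halves of each word), <<<12 and <<<7 with a ushr/sli pair, and <<<8 with tbl driven by the .Lrot24 byte-permutation table. A plain C sketch of the equivalent scalar operations (helper names here are illustrative only):

    #include <stdint.h>

    static inline uint32_t rotl32(uint32_t x, unsigned n)
    {
      return (x << n) | (x >> (32 - n));
    }

    /* rev32 .8h: swap the 16-bit halves of each 32-bit word == rotl32(x, 16) */
    static inline uint32_t rot16(uint32_t x) { return rotl32(x, 16); }

    /* ushr #20 then sli #12: (x >> 20) | (x << 12) == rotl32(x, 12) */
    static inline uint32_t rot12(uint32_t x) { return (x >> 20) | (x << 12); }

    /* tbl with .Lrot24 = {0x02010003,...}: byte i of each little-endian word is
       taken from byte (i+3) mod 4, which is rotl32(x, 8) */
    static inline uint32_t rot8(uint32_t x) { return rotl32(x, 8); }

    /* ushr #25 then sli #7: (x >> 25) | (x << 7) == rotl32(x, 7) */
    static inline uint32_t rot7(uint32_t x) { return (x >> 25) | (x << 7); }
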
diff --git a/arm64/asimd/chacha-4core.asm b/arm64/asimd/chacha-4core.asm
new file mode 100644 (file)
index 0000000..5690e54
--- /dev/null
@@ -0,0 +1,228 @@
+C arm64/asimd/chacha-4core.asm
+
+ifelse(`
+   Copyright (C) 2020 Niels Möller and Torbjörn Granlund
+   Copyright (C) 2022 Mamone Tarsha
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+
+C Register usage:
+
+C Arguments
+define(`DST', `x0')
+define(`SRC', `x1')
+define(`ROUNDS', `x2')
+
+C Working state
+
+C During the loop, used to save the original values for last 4 words
+C of each block. Also used as temporaries for transpose.
+define(`T0', `v0')
+define(`T1', `v1')
+define(`T2', `v2')
+define(`T3', `v3')
+
+define(`TMP0', `v4')
+define(`TMP1', `v5')
+define(`TMP2', `v6')
+define(`TMP3', `v7')
+
+define(`ROT24', `v8')
+
+C Main loop for round
+define(`QR',`
+       add             $1.4s, $1.4s, $2.4s
+       add             $5.4s, $5.4s, $6.4s
+       add             $9.4s, $9.4s, $10.4s
+       add             $13.4s, $13.4s, $14.4s
+       eor             $4.16b, $4.16b, $1.16b
+       eor             $8.16b, $8.16b, $5.16b
+       eor             $12.16b, $12.16b, $9.16b
+       eor             $16.16b, $16.16b, $13.16b
+       rev32   $4.8h, $4.8h
+       rev32   $8.8h, $8.8h
+       rev32   $12.8h, $12.8h
+       rev32   $16.8h, $16.8h
+
+       add             $3.4s, $3.4s, $4.4s
+       add             $7.4s, $7.4s, $8.4s
+       add             $11.4s, $11.4s, $12.4s
+       add             $15.4s, $15.4s, $16.4s
+       eor             TMP0.16b, $2.16b, $3.16b
+       eor             TMP1.16b, $6.16b, $7.16b
+       eor             TMP2.16b, $10.16b, $11.16b
+       eor             TMP3.16b, $14.16b, $15.16b
+       ushr    $2.4s, TMP0.4s, #20
+       ushr    $6.4s, TMP1.4s, #20
+       ushr    $10.4s, TMP2.4s, #20
+       ushr    $14.4s, TMP3.4s, #20
+       sli             $2.4s, TMP0.4s, #12
+       sli             $6.4s, TMP1.4s, #12
+       sli             $10.4s, TMP2.4s, #12
+       sli             $14.4s, TMP3.4s, #12
+
+       add             $1.4s, $1.4s, $2.4s
+       add             $5.4s, $5.4s, $6.4s
+       add             $9.4s, $9.4s, $10.4s
+       add             $13.4s, $13.4s, $14.4s
+       eor             $4.16b, $4.16b, $1.16b
+       eor             $8.16b, $8.16b, $5.16b
+       eor             $12.16b, $12.16b, $9.16b
+       eor             $16.16b, $16.16b, $13.16b
+       tbl             $4.16b, {$4.16b}, ROT24.16b
+       tbl             $8.16b, {$8.16b}, ROT24.16b
+       tbl             $12.16b, {$12.16b}, ROT24.16b
+       tbl             $16.16b, {$16.16b}, ROT24.16b
+
+       add             $3.4s, $3.4s, $4.4s
+       add             $7.4s, $7.4s, $8.4s
+       add             $11.4s, $11.4s, $12.4s
+       add             $15.4s, $15.4s, $16.4s
+       eor             TMP0.16b, $2.16b, $3.16b
+       eor             TMP1.16b, $6.16b, $7.16b
+       eor             TMP2.16b, $10.16b, $11.16b
+       eor             TMP3.16b, $14.16b, $15.16b
+       ushr    $2.4s, TMP0.4s, #25
+       ushr    $6.4s, TMP1.4s, #25
+       ushr    $10.4s, TMP2.4s, #25
+       ushr    $14.4s, TMP3.4s, #25
+       sli             $2.4s, TMP0.4s, #7
+       sli             $6.4s, TMP1.4s, #7
+       sli             $10.4s, TMP2.4s, #7
+       sli             $14.4s, TMP3.4s, #7
+')
+
+define(`TRANSPOSE',`
+       zip1    T0.4s, $1.4s, $3.4s             C A0 A2 B0 B2
+       zip1    T1.4s, $2.4s, $4.4s             C A1 A3 B1 B3
+       zip2    T2.4s, $1.4s, $3.4s             C C0 C2 D0 D2
+       zip2    T3.4s, $2.4s, $4.4s             C C1 C3 D1 D3
+
+       zip1    $1.4s, T0.4s, T1.4s             C A0 A1 A2 A3
+       zip2    $2.4s, T0.4s, T1.4s             C B0 B1 B2 B3
+       zip1    $3.4s, T2.4s, T3.4s             C C0 C1 C2 C3
+       zip2    $4.4s, T2.4s, T3.4s             C D0 D1 D2 D3
+')
+
+       C _chacha_4core(uint32_t *dst, const uint32_t *src, unsigned rounds)
+PROLOGUE(_nettle_chacha_4core)
+
+       mov             w3, #1
+       dup             TMP2.4s, w3     C Apply counter carries
+
+.Lshared_entry:
+
+       C Save callee-save registers
+       fmov    x3, d8
+
+       adr             x4, .Lcnts
+       ld1             {TMP3.4s,ROT24.4s},[x4]
+
+C Load state and splat
+       ld1             {v16.4s,v17.4s,v18.4s,v19.4s}, [SRC]
+
+       dup             v20.4s, v16.s[1]
+       dup             v24.4s, v16.s[2]
+       dup             v28.4s, v16.s[3]
+       dup             v16.4s, v16.s[0]
+       dup             v21.4s, v17.s[1]
+       dup             v25.4s, v17.s[2]
+       dup             v29.4s, v17.s[3]
+       dup             v17.4s, v17.s[0]
+       dup             v22.4s, v18.s[1]
+       dup             v26.4s, v18.s[2]
+       dup             v30.4s, v18.s[3]
+       dup             v18.4s, v18.s[0]
+       dup             v23.4s, v19.s[1]
+       dup             v27.4s, v19.s[2]
+       dup             v31.4s, v19.s[3]
+       dup             v19.4s, v19.s[0]
+
+       add             v19.4s, v19.4s, TMP3.4s C low adds
+       cmhi    TMP1.4s, TMP3.4s, v19.4s        C compute carry-out
+       and             TMP1.16b, TMP1.16b, TMP2.16b    C discard carries for 32-bit counter variant
+       add             v23.4s, v23.4s, TMP1.4s C apply carries
+
+       C Save the last four words of each of the four blocks (4x4 words).
+       mov             T0.16b, v19.16b
+       mov             T1.16b, v23.16b
+       mov             T2.16b, v27.16b
+       mov             T3.16b, v31.16b
+
+.Loop:
+       QR(v16, v17,  v18, v19, v20, v21,  v22, v23, v24, v25, v26, v27, v28, v29, v30, v31)
+       QR(v16, v21, v26, v31, v20, v25, v30, v19, v24, v29,  v18, v23, v28, v17,  v22, v27)
+       subs    ROUNDS, ROUNDS, #2
+       b.ne    .Loop
+
+       C Add in saved original words, including counters, before
+       C transpose.
+       add             v19.4s, v19.4s, T0.4s
+       add             v23.4s, v23.4s, T1.4s
+       add             v27.4s, v27.4s, T2.4s
+       add             v31.4s, v31.4s, T3.4s
+
+       TRANSPOSE(v16, v20, v24, v28)
+       TRANSPOSE(v17, v21, v25, v29)
+       TRANSPOSE(v18, v22, v26, v30)
+       TRANSPOSE(v19, v23, v27, v31)
+
+       ld1             {T0.4s,T1.4s,T2.4s}, [SRC]
+
+       add             v16.4s, v16.4s, T0.4s
+       add             v20.4s, v20.4s, T0.4s
+       add             v24.4s, v24.4s, T0.4s
+       add             v28.4s, v28.4s, T0.4s
+
+       add             v17.4s, v17.4s, T1.4s
+       add             v21.4s, v21.4s, T1.4s
+       add             v25.4s, v25.4s, T1.4s
+       add             v29.4s, v29.4s, T1.4s
+
+       add             v18.4s, v18.4s, T2.4s
+       add             v22.4s, v22.4s, T2.4s
+       add             v26.4s, v26.4s, T2.4s
+       add             v30.4s, v30.4s, T2.4s
+
+       st1             {v16.16b,v17.16b,v18.16b,v19.16b}, [DST], #64
+       st1             {v20.16b,v21.16b,v22.16b,v23.16b}, [DST], #64
+       st1             {v24.16b,v25.16b,v26.16b,v27.16b}, [DST], #64
+       st1             {v28.16b,v29.16b,v30.16b,v31.16b}, [DST]
+
+       C Restore callee-save registers
+       fmov    d8, x3
+       ret
+EPILOGUE(_nettle_chacha_4core)
+
+PROLOGUE(_nettle_chacha_4core32)
+       eor             TMP2.16b, TMP2.16b, TMP2.16b    C Ignore counter carries
+       b               .Lshared_entry
+EPILOGUE(_nettle_chacha_4core32)
+
+.align 4
+.Lcnts: .long  0,1,2,3         C increments
+.Lrot24: .long 0x02010003,0x06050407,0x0a09080b,0x0e0d0c0f
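
Note (editor's aside, not part of the commit): the add/cmhi/and/add sequence after the splats gives block i the counter value src[12] + i and, for the 64-bit-counter entry point only, carries an overflow of word 12 into word 13; the and with TMP2 zeroes the carries for _nettle_chacha_4core32. Roughly, in C (function and variable names are illustrative only):

    #include <stdint.h>

    /* Sketch of the per-block counter setup in _nettle_chacha_4core.
       carry_mask is 1 for the 64-bit counter variant, 0 for the 32-bit one. */
    static void
    setup_counters(uint32_t w12[4], uint32_t w13[4],
                   uint32_t base_lo, uint32_t base_hi, uint32_t carry_mask)
    {
      for (unsigned i = 0; i < 4; i++)
        {
          w12[i] = base_lo + i;                    /* add v19.4s, v19.4s, {0,1,2,3} */
          uint32_t carry = (w12[i] < i);           /* cmhi: did the addition wrap? */
          w13[i] = base_hi + (carry & carry_mask); /* and with TMP2, add into word 13 */
        }
    }
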
diff --git a/arm64/asimd/chacha-core-internal.asm b/arm64/asimd/chacha-core-internal.asm
new file mode 100644 (file)
index 0000000..da10ad1
--- /dev/null
@@ -0,0 +1,126 @@
+C arm64/asimd/chacha-core-internal.asm
+
+ifelse(`
+   Copyright (C) 2020 Niels Möller and Torbjörn Granlund
+   Copyright (C) 2022 Mamone Tarsha
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+
+C Register usage:
+
+C Arguments
+define(`DST', `x0')
+define(`SRC', `x1')
+define(`ROUNDS', `x2')
+
+C Working state
+define(`X0', `v0')
+define(`X1', `v1')
+define(`X2', `v2')
+define(`X3', `v3')
+
+C Original input state
+define(`S0', `v4')
+define(`S1', `v5')
+define(`S2', `v6')
+define(`S3', `v7')
+
+define(`ROT24', `v16')
+
+define(`TMP', `v17')
+
+C QROUND(X0, X1, X2, X3)
+define(`QROUND', `
+       C x0 += x1, x3 ^= x0, x3 lrot 16
+       C x2 += x3, x1 ^= x2, x1 lrot 12
+       C x0 += x1, x3 ^= x0, x3 lrot 8
+       C x2 += x3, x1 ^= x2, x1 lrot 7
+
+       add             $1.4s, $1.4s, $2.4s
+       eor             $4.16b, $4.16b, $1.16b
+       rev32   $4.8h, $4.8h
+
+       add             $3.4s, $3.4s, $4.4s
+       eor             TMP.16b, $2.16b, $3.16b
+       ushr    $2.4s, TMP.4s, #20
+       sli             $2.4s, TMP.4s, #12
+
+       add             $1.4s, $1.4s, $2.4s
+       eor             $4.16b, $4.16b, $1.16b
+       tbl             $4.16b, {$4.16b}, ROT24.16b
+
+       add             $3.4s, $3.4s, $4.4s
+       eor             TMP.16b, $2.16b, $3.16b
+       ushr    $2.4s, TMP.4s, #25
+       sli             $2.4s, TMP.4s, #7
+')
+
+       .text
+       C _chacha_core(uint32_t *dst, const uint32_t *src, unsigned rounds)
+PROLOGUE(_nettle_chacha_core)
+       adr             x3, .Lrot24
+       ld1             {ROT24.4s},[x3]
+
+       ld1             {X0.4s,X1.4s,X2.4s,X3.4s}, [SRC]
+
+       mov             S0.16b, X0.16b
+       mov             S1.16b, X1.16b
+       mov             S2.16b, X2.16b
+       mov             S3.16b, X3.16b
+
+.Loop:
+       QROUND(X0, X1, X2, X3)
+       C Rotate rows, to get
+       C        0  1  2  3
+       C        5  6  7  4  <<< 1
+       C       10 11  8  9  <<< 2
+       C       15 12 13 14  <<< 3
+
+       ext             X1.16b, X1.16b, X1.16b, #4
+       ext             X2.16b, X2.16b, X2.16b, #8
+       ext             X3.16b, X3.16b, X3.16b, #12
+
+       QROUND(X0, X1, X2, X3)
+
+       ext             X1.16b, X1.16b, X1.16b, #12
+       ext             X2.16b, X2.16b, X2.16b, #8
+       ext             X3.16b, X3.16b, X3.16b, #4
+
+       subs    ROUNDS, ROUNDS, #2
+       b.ne    .Loop
+
+       add             X0.4s, X0.4s, S0.4s
+       add             X1.4s, X1.4s, S1.4s
+       add             X2.4s, X2.4s, S2.4s
+       add             X3.4s, X3.4s, S3.4s
+
+       st1             {X0.16b,X1.16b,X2.16b,X3.16b}, [DST]
+       ret
+EPILOGUE(_nettle_chacha_core)
+
+.align 4
+.Lrot24: .long 0x02010003,0x06050407,0x0a09080b,0x0e0d0c0f
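
Note (editor's aside, not part of the commit): QROUND is a vectorized form of the standard ChaCha quarter round described in its comment, running four quarter rounds in parallel, one per 32-bit lane. The scalar operation it corresponds to is:

    #include <stdint.h>

    #define ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))

    /* One ChaCha quarter round on four state words. */
    static inline void
    chacha_qround(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
    {
      *a += *b; *d ^= *a; *d = ROTL32(*d, 16);
      *c += *d; *b ^= *c; *b = ROTL32(*b, 12);
      *a += *b; *d ^= *a; *d = ROTL32(*d, 8);
      *c += *d; *b ^= *c; *b = ROTL32(*b, 7);
    }
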
diff --git a/arm64/fat/chacha-2core.asm b/arm64/fat/chacha-2core.asm
new file mode 100644 (file)
index 0000000..cb1b95d
--- /dev/null
@@ -0,0 +1,36 @@
+C arm64/fat/chacha-2core.asm
+
+
+ifelse(`
+   Copyright (C) 2022 Mamone Tarsha
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+
+dnl PROLOGUE(_nettle_fat_chacha_2core) picked up by configure
+
+include_src(`arm64/asimd/chacha-2core.asm')
diff --git a/arm64/fat/chacha-4core.asm b/arm64/fat/chacha-4core.asm
new file mode 100644 (file)
index 0000000..2d89e6a
--- /dev/null
@@ -0,0 +1,36 @@
+C arm64/fat/chacha-4core.asm
+
+
+ifelse(`
+   Copyright (C) 2022 Mamone Tarsha
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+
+dnl PROLOGUE(_nettle_fat_chacha_4core) picked up by configure
+
+include_src(`arm64/asimd/chacha-4core.asm')
diff --git a/arm64/fat/chacha-core-internal-2.asm b/arm64/fat/chacha-core-internal-2.asm
new file mode 100644 (file)
index 0000000..dad4c69
--- /dev/null
@@ -0,0 +1,37 @@
+C arm64/fat/chacha-core-internal-2.asm
+
+
+ifelse(`
+   Copyright (C) 2022 Mamone Tarsha
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+
+dnl PROLOGUE(_nettle_chacha_core) picked up by configure
+
+define(`fat_transform', `$1_arm64')
+include_src(`arm64/asimd/chacha-core-internal.asm')
index da72f908ac9e251c01d7923fe487f2f416bd99b5..053535ba775d35ac7efbf0a1c105fd548b0faeca 100644 (file)
@@ -81,6 +81,10 @@ AC_ARG_ENABLE(arm-neon,
   AC_HELP_STRING([--enable-arm-neon], [Enable ARM Neon assembly. (default=auto)]),,
   [enable_arm_neon=auto])
 
+AC_ARG_ENABLE(arm64-asimd,
+  AC_HELP_STRING([--enable-arm64-asimd], [Enable Arm64 advanced SIMD. (default=no)]),,
+  [enable_arm64_asimd=no])
+
 AC_ARG_ENABLE(arm64-crypto,
   AC_HELP_STRING([--enable-arm64-crypto], [Enable Arm64 crypto extension. (default=no)]),,
   [enable_arm64_crypto=no])
@@ -511,8 +515,11 @@ if test "x$enable_assembler" = xyes ; then
         if test "x$enable_fat" = xyes ; then
           asm_path="arm64/fat $asm_path"
           OPT_NETTLE_SOURCES="fat-arm64.c $OPT_NETTLE_SOURCES"
-          FAT_TEST_LIST="none aes pmull sha1 sha2"
+          FAT_TEST_LIST="none asimd aes pmull sha1 sha2"
         else
+          if test "$enable_arm64_asimd" = yes ; then
+            asm_path="arm64/asimd $asm_path"
+          fi
           if test "$enable_arm64_crypto" = yes ; then
             asm_path="arm64/crypto $asm_path"
           fi
index fcb2ece815e499855773bf0bd2726e4e37742be8..af3c98ed32ae017d863a440b15da2ab988a9d4aa 100644 (file)
@@ -74,6 +74,7 @@
 
 struct arm64_features
 {
+  int have_asimd;
   int have_aes;
   int have_pmull;
   int have_sha1;
@@ -87,6 +88,7 @@ static void
 get_arm64_features (struct arm64_features *features)
 {
   const char *s;
+  features->have_asimd = 0;
   features->have_aes = 0;
   features->have_pmull = 0;
   features->have_sha1 = 0;
@@ -99,7 +101,9 @@ get_arm64_features (struct arm64_features *features)
        const char *sep = strchr (s, ',');
        size_t length = sep ? (size_t) (sep - s) : strlen(s);
 
-       if (MATCH (s, length, "aes", 3))
+       if (MATCH (s, length, "asimd", 5))
+         features->have_asimd = 1;
+  else if (MATCH (s, length, "aes", 3))
          features->have_aes = 1;
   else if (MATCH (s, length, "pmull", 5))
          features->have_pmull = 1;
@@ -115,6 +119,8 @@ get_arm64_features (struct arm64_features *features)
     {
 #if USE_GETAUXVAL
       unsigned long hwcap = getauxval(AT_HWCAP);
+      features->have_asimd
+       = ((hwcap & HWCAP_ASIMD) == HWCAP_ASIMD);
       features->have_aes
        = ((hwcap & (HWCAP_ASIMD | HWCAP_AES)) == (HWCAP_ASIMD | HWCAP_AES));
       features->have_pmull
@@ -166,6 +172,18 @@ DECLARE_FAT_FUNC(_nettle_sha256_compress, sha256_compress_func)
 DECLARE_FAT_FUNC_VAR(sha256_compress, sha256_compress_func, c)
 DECLARE_FAT_FUNC_VAR(sha256_compress, sha256_compress_func, arm64)
 
+DECLARE_FAT_FUNC(_nettle_chacha_core, chacha_core_func)
+DECLARE_FAT_FUNC_VAR(chacha_core, chacha_core_func, c);
+DECLARE_FAT_FUNC_VAR(chacha_core, chacha_core_func, arm64);
+
+DECLARE_FAT_FUNC(nettle_chacha_crypt, chacha_crypt_func)
+DECLARE_FAT_FUNC_VAR(chacha_crypt, chacha_crypt_func, 1core)
+DECLARE_FAT_FUNC_VAR(chacha_crypt, chacha_crypt_func, 4core)
+
+DECLARE_FAT_FUNC(nettle_chacha_crypt32, chacha_crypt_func)
+DECLARE_FAT_FUNC_VAR(chacha_crypt32, chacha_crypt_func, 1core)
+DECLARE_FAT_FUNC_VAR(chacha_crypt32, chacha_crypt_func, 4core)
+
 static void CONSTRUCTOR
 fat_init (void)
 {
@@ -176,8 +194,9 @@ fat_init (void)
 
   verbose = getenv (ENV_VERBOSE) != NULL;
   if (verbose)
-    fprintf (stderr, "libnettle: cpu features:%s%s%s%s\n",
-            features.have_aes ? " aes instructions" : "",
+    fprintf (stderr, "libnettle: cpu features:%s%s%s%s%s\n",
+            features.have_asimd ? " advanced simd" : "",
+       features.have_aes ? " aes instructions" : "",
             features.have_pmull ? " polynomial multiply long instructions (PMULL/PMULL2)" : "",
        features.have_sha1 ? " sha1 instructions" : "",
        features.have_sha2 ? " sha2 instructions" : "");
@@ -243,6 +262,20 @@ fat_init (void)
     {
       _nettle_sha256_compress_vec = _nettle_sha256_compress_c;
     }
+  if (features.have_asimd)
+    {
+      if (verbose)
+       fprintf (stderr, "libnettle: enabling advanced simd code.\n");
+      _nettle_chacha_core_vec = _nettle_chacha_core_arm64;
+      nettle_chacha_crypt_vec = _nettle_chacha_crypt_4core;
+      nettle_chacha_crypt32_vec = _nettle_chacha_crypt32_4core;
+    }
+  else
+    {
+      _nettle_chacha_core_vec = _nettle_chacha_core_c;
+      nettle_chacha_crypt_vec = _nettle_chacha_crypt_1core;
+      nettle_chacha_crypt32_vec = _nettle_chacha_crypt32_1core;
+    }
 }
 
 DEFINE_FAT_FUNC(nettle_aes128_encrypt, void,
@@ -290,3 +323,21 @@ DEFINE_FAT_FUNC(nettle_sha1_compress, void,
 DEFINE_FAT_FUNC(_nettle_sha256_compress, void,
                (uint32_t *state, const uint8_t *input, const uint32_t *k),
                (state, input, k))
+
+DEFINE_FAT_FUNC(_nettle_chacha_core, void,
+               (uint32_t *dst, const uint32_t *src, unsigned rounds),
+               (dst, src, rounds))
+
+DEFINE_FAT_FUNC(nettle_chacha_crypt, void,
+               (struct chacha_ctx *ctx,
+                size_t length,
+                uint8_t *dst,
+                const uint8_t *src),
+               (ctx, length, dst, src))
+
+DEFINE_FAT_FUNC(nettle_chacha_crypt32, void,
+               (struct chacha_ctx *ctx,
+                size_t length,
+                uint8_t *dst,
+                const uint8_t *src),
+               (ctx, length, dst, src))
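
Note (editor's aside, not part of the commit): fat_init selects the Advanced SIMD ChaCha routines at runtime from the have_asimd flag, which on Linux comes from getauxval(AT_HWCAP) & HWCAP_ASIMD (or from the feature-override string parsed above). A minimal standalone check of the same bit might look like this; the headers are the usual Linux/aarch64 ones and are an assumption, since the diff only shows the getauxval test itself:

    #include <stdio.h>
    #include <sys/auxv.h>   /* getauxval, AT_HWCAP */
    #include <asm/hwcap.h>  /* HWCAP_ASIMD on aarch64 Linux */

    int
    main(void)
    {
      unsigned long hwcap = getauxval(AT_HWCAP);
      /* Same test fat_init relies on before enabling the asimd ChaCha code. */
      printf("advanced simd: %s\n", (hwcap & HWCAP_ASIMD) ? "yes" : "no");
      return 0;
    }
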