DECLARE_FAT_FUNC_VAR(sha3_permute, sha3_permute_func, c)
DECLARE_FAT_FUNC_VAR(sha3_permute, sha3_permute_func, s390x)
+DECLARE_FAT_FUNC(_nettle_chacha_core, chacha_core_func)
+DECLARE_FAT_FUNC_VAR(chacha_core, chacha_core_func, c);
+DECLARE_FAT_FUNC_VAR(chacha_core, chacha_core_func, s390x);
+
+DECLARE_FAT_FUNC(nettle_chacha_crypt, chacha_crypt_func)
+DECLARE_FAT_FUNC_VAR(chacha_crypt, chacha_crypt_func, 1core)
+DECLARE_FAT_FUNC_VAR(chacha_crypt, chacha_crypt_func, 4core)
+
+DECLARE_FAT_FUNC(nettle_chacha_crypt32, chacha_crypt_func)
+DECLARE_FAT_FUNC_VAR(chacha_crypt32, chacha_crypt_func, 1core)
+DECLARE_FAT_FUNC_VAR(chacha_crypt32, chacha_crypt_func, 4core)
+
static void CONSTRUCTOR
fat_init (void)
{
if (features.have_vector_facility)
{
if (verbose)
- fprintf (stderr, "libnettle: enabling vectorized memxor3.\n");
+ fprintf (stderr, "libnettle: enabling vector facility code.\n");
nettle_memxor3_vec = _nettle_memxor3_s390x;
-
- if (verbose)
- fprintf (stderr, "libnettle: enabling vectorized sha3 permute.\n");
nettle_sha3_permute_vec = _nettle_sha3_permute_s390x;
+ _nettle_chacha_core_vec = _nettle_chacha_core_s390x;
+ nettle_chacha_crypt_vec = _nettle_chacha_crypt_4core;
+ nettle_chacha_crypt32_vec = _nettle_chacha_crypt32_4core;
}
else
{
nettle_memxor3_vec = _nettle_memxor3_c;
-
nettle_sha3_permute_vec = _nettle_sha3_permute_c;
+ _nettle_chacha_core_vec = _nettle_chacha_core_c;
+ nettle_chacha_crypt_vec = _nettle_chacha_crypt_1core;
+ nettle_chacha_crypt32_vec = _nettle_chacha_crypt32_1core;
}
/* AES128 */
/* SHA3 */
DEFINE_FAT_FUNC(nettle_sha3_permute, void,
(struct sha3_state *state), (state))
+
+DEFINE_FAT_FUNC(_nettle_chacha_core, void,
+ (uint32_t *dst, const uint32_t *src, unsigned rounds),
+ (dst, src, rounds))
+
+DEFINE_FAT_FUNC(nettle_chacha_crypt, void,
+ (struct chacha_ctx *ctx,
+ size_t length,
+ uint8_t *dst,
+ const uint8_t *src),
+ (ctx, length, dst, src))
+
+DEFINE_FAT_FUNC(nettle_chacha_crypt32, void,
+ (struct chacha_ctx *ctx,
+ size_t length,
+ uint8_t *dst,
+ const uint8_t *src),
+ (ctx, length, dst, src))
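
The DECLARE/DEFINE pairs above set up nettle's usual fat indirection: each entry point calls through a `_vec` pointer that fat_init points at either the portable C variant or the s390x assembly, and the 1core/4core chacha_crypt variants referenced in fat_init are C wrappers that consume the core routines one or four blocks at a time. Roughly, and with illustrative names rather than the real fat-setup.h macro expansion:

  /* A minimal model of the fat-call indirection; names here are
     illustrative, not nettle's actual fat-setup.h macros. */
  #include <stdint.h>

  typedef void chacha_core_func (uint32_t *dst, const uint32_t *src,
                                 unsigned rounds);

  static void
  chacha_core_c (uint32_t *dst, const uint32_t *src, unsigned rounds)
  {
    (void) dst; (void) src; (void) rounds;  /* portable fallback */
  }

  static void
  chacha_core_s390x (uint32_t *dst, const uint32_t *src, unsigned rounds)
  {
    (void) dst; (void) src; (void) rounds;  /* stand-in for the asm */
  }

  /* Selected once, by a constructor run at load time. */
  static chacha_core_func *chacha_core_vec = chacha_core_c;

  static void
  fat_init_model (int have_vector_facility)
  {
    chacha_core_vec = have_vector_facility
      ? chacha_core_s390x : chacha_core_c;
  }

  /* The exported function only jumps through the pointer. */
  void
  chacha_core_model (uint32_t *dst, const uint32_t *src, unsigned rounds)
  {
    chacha_core_vec (dst, src, rounds);
  }
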
--- /dev/null
+C s390x/fat/chacha-2core.asm
+
+ifelse(`
+ Copyright (C) 2022 Mamone Tarsha
+
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+')
+
+dnl PROLOGUE(_nettle_fat_chacha_2core) picked up by configure
+
+include_src(`s390x/vf/chacha-2core.asm')
--- /dev/null
+C s390x/fat/chacha-4core.asm
+
+ifelse(`
+ Copyright (C) 2022 Mamone Tarsha
+
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+')
+
+dnl PROLOGUE(_nettle_fat_chacha_4core) picked up by configure
+
+include_src(`s390x/vf/chacha-4core.asm')
--- /dev/null
+C s390x/fat/chacha-core-internal-2.asm
+
+ifelse(`
+ Copyright (C) 2022 Mamone Tarsha
+
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+')
+
+dnl PROLOGUE(_nettle_chacha_core) picked up by configure
+
+define(`fat_transform', `$1_s390x')
+include_src(`s390x/vf/chacha-core-internal.asm')
--- /dev/null
+C s390x/vf/chacha-2core.asm
+
+ifelse(`
+ Copyright (C) 2020 Niels Möller and Torbjörn Granlund
+ Copyright (C) 2022 Mamone Tarsha
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+')
+
+C Register usage:
+
+C Arguments
+define(`DST', `%r2')
+define(`SRC', `%r3')
+define(`ROUNDS', `%r4')
+
+C State, even elements in X, odd elements in Y
+define(`X0', `%v1')
+define(`X1', `%v2')
+define(`X2', `%v3')
+define(`X3', `%v29')
+define(`Y0', `%v4')
+define(`Y1', `%v5')
+define(`Y2', `%v6')
+define(`Y3', `%v7')
+
+C Original input state
+define(`S0', `%v24')
+define(`S1', `%v25')
+define(`S2', `%v26')
+define(`S3', `%v27')
+define(`S3p1', `%v28')
+
+define(`T0', `%v0')
+
+define(`BRW', `%v30')
+define(`EW', `%v30')
+define(`OW', `%v31')
+
+.file "chacha-2core.asm"
+.machine "z13"
+
+.text
+C _chacha_2core(uint32_t *dst, const uint32_t *src, unsigned rounds)
+
+PROLOGUE(_nettle_chacha_2core)
+
+ vzero X1
+ vleif X1, 1, 0
+
+ vl X3, 48(SRC)
+
+ vaccf Y3, X3, X1 C Counter carry out
+ vsldb Y3, Y3, Y3, 12
+ vo Y3, Y3, X1
+
+.Lshared_entry:
+ vaf Y3, Y3, X3
+
+ vlm X0, X2, 0(SRC)
+
+ vlr S0, X0
+ vlr S1, X1
+ vlr S2, X2
+ vlr S3, X3
+ vlr S3p1, Y3
+
+ larl %r5,.Lword_even
+ vlm EW, OW, 0(%r5)
+
+ vperm Y0, X0, X0, OW C 1 1 3 3
+ vperm X0, X0, X0, EW C 0 0 2 2
+ vperm Y1, X1, X1, OW C 5 5 7 7
+ vperm X1, X1, X1, EW C 4 4 6 6
+ vperm Y2, X2, X2, OW C 9 9 11 11
+ vperm X2, X2, X2, EW C 8 8 10 10
+ vperm Y3, X3, S3p1, OW C 13 13 15 15
+ vperm X3, X3, S3p1, EW C 12 12 14 14
+
+ srlg ROUNDS, ROUNDS, 1
+.Loop:
+C Register layout (A is first block, B is second block)
+C
+C X0: A0 B0 A2 B2 Y0: A1 B1 A3 B3
+C X1: A4 B4 A6 B6 Y1: A5 B5 A7 B7
+C X2: A8 B8 A10 B10 Y2: A9 B9 A11 B11
+C X3: A12 B12 A14 B14 Y3: A13 B13 A15 B15
+ vaf X0, X0, X1
+ vaf Y0, Y0, Y1
+ vx X3, X3, X0
+ vx Y3, Y3, Y0
+ verllf X3, X3, 16
+ verllf Y3, Y3, 16
+
+ vaf X2, X2, X3
+ vaf Y2, Y2, Y3
+ vx X1, X1, X2
+ vx Y1, Y1, Y2
+ verllf X1, X1, 12
+ verllf Y1, Y1, 12
+
+ vaf X0, X0, X1
+ vaf Y0, Y0, Y1
+ vx X3, X3, X0
+ vx Y3, Y3, Y0
+ verllf X3, X3, 8
+ verllf Y3, Y3, 8
+
+ vaf X2, X2, X3
+ vaf Y2, Y2, Y3
+ vx X1, X1, X2
+ vx Y1, Y1, Y2
+ verllf X1, X1, 7
+ verllf Y1, Y1, 7
+
+ vpdi X1, X1, X1, 0b0100
+ vpdi X2, X2, X2, 0b0100
+ vpdi Y2, Y2, Y2, 0b0100
+ vpdi Y3, Y3, Y3, 0b0100
+
+C Register layout:
+C X0: A0 B0 A2 B2 Y0: A1 B1 A3 B3
+C Y1: A5 B5 A7 B7 X1: A6 B6 A4 B4 (X1 swapped)
+C X2: A10 B10 A8 B8 Y2: A11 B11 A9 B9 (X2, Y2 swapped)
+C X3: A12 B12 A14 B14 Y3: A15 B15 A13 B13 (Y3 swapped)
+
+ vaf X0, X0, Y1
+ vaf Y0, Y0, X1
+ vx Y3, Y3, X0
+ vx X3, X3, Y0
+ verllf Y3, Y3, 16
+ verllf X3, X3, 16
+
+ vaf X2, X2, Y3
+ vaf Y2, Y2, X3
+ vx Y1, Y1, X2
+ vx X1, X1, Y2
+ verllf Y1, Y1, 12
+ verllf X1, X1, 12
+
+ vaf X0, X0, Y1
+ vaf Y0, Y0, X1
+ vx Y3, Y3, X0
+ vx X3, X3, Y0
+ verllf Y3, Y3, 8
+ verllf X3, X3, 8
+
+ vaf X2, X2, Y3
+ vaf Y2, Y2, X3
+ vx Y1, Y1, X2
+ vx X1, X1, Y2
+ verllf Y1, Y1, 7
+ verllf X1, X1, 7
+
+ vpdi X1, X1, X1, 0b0100
+ vpdi X2, X2, X2, 0b0100
+ vpdi Y2, Y2, Y2, 0b0100
+ vpdi Y3, Y3, Y3, 0b0100
+
+ brctg ROUNDS, .Loop
+
+ vperm T0, X0, Y0, EW
+ vperm Y0, X0, Y0, OW
+
+ vperm X0, X1, Y1, EW
+ vperm Y1, X1, Y1, OW
+
+ vperm X1, X2, Y2, EW
+ vperm Y2, X2, Y2, OW
+
+ vperm X2, X3, Y3, EW
+ vperm Y3, X3, Y3, OW
+
+ vaf T0, T0, S0
+ vaf Y0, Y0, S0
+ vaf X0, X0, S1
+ vaf Y1, Y1, S1
+ vaf X1, X1, S2
+ vaf Y2, Y2, S2
+ vaf X2, X2, S3
+ vaf Y3, Y3, S3p1
+
+ vl BRW, 32(%r5)
+ vperm T0, T0, T0, BRW
+ vperm X0, X0, X0, BRW
+ vperm X1, X1, X1, BRW
+ vperm X2, X2, X2, BRW
+ vperm Y0, Y0, Y0, BRW
+ vperm Y1, Y1, Y1, BRW
+ vperm Y2, Y2, Y2, BRW
+ vperm Y3, Y3, Y3, BRW
+
+ vstm T0, Y3, 0(DST)
+ br RA
+EPILOGUE(_nettle_chacha_2core)
+
+PROLOGUE(_nettle_chacha_2core32)
+ vzero Y3
+ vleif Y3, 1, 0
+ vl X3, 48(SRC)
+ j .Lshared_entry
+EPILOGUE(_nettle_chacha_2core32)
+
+.align 16
+.Lword_even: .long 0x00010203,0x10111213,0x08090A0B,0x18191A1B
+.Lword_odd: .long 0x04050607,0x14151617,0x0C0D0E0F,0x1C1D1E1F
+.Lword_byte_reverse: .long 0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C
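
In this routine the two blocks are interleaved word-wise: the X registers hold the even-indexed state words of both blocks, the Y registers the odd-indexed ones, so each vector instruction advances both blocks at once. The two entry points differ only in how the second block's counter is formed: _nettle_chacha_2core propagates the carry out of word 12 into word 13 (a 64-bit counter), while _nettle_chacha_2core32 adds 1 with no carry. A scalar sketch of that setup, assuming the standard ChaCha state layout (illustration only):

  /* Scalar sketch of the counter setup before .Lshared_entry; a[] is
     the last state row (words 12-15) of the first block, counter64
     selects the _nettle_chacha_2core behavior. */
  #include <stdint.h>

  static void
  second_block_row3 (uint32_t b[4], const uint32_t a[4], int counter64)
  {
    uint32_t carry = (uint32_t) (a[0] == 0xffffffffu); /* vaccf */
    b[0] = a[0] + 1;                       /* vo Y3, Y3, X1; vaf Y3, Y3, X3 */
    b[1] = a[1] + (counter64 ? carry : 0); /* vsldb moved the carry up */
    b[2] = a[2];                           /* nonce words unchanged */
    b[3] = a[3];
  }
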
--- /dev/null
+C s390x/vf/chacha-4core.asm
+
+ifelse(`
+ Copyright (C) 2020 Niels Möller and Torbjörn Granlund
+ Copyright (C) 2022 Mamone Tarsha
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+')
+
+C Register usage:
+
+C Arguments
+define(`DST', `%r2')
+define(`SRC', `%r3')
+define(`ROUNDS', `%r4')
+
+C Working state
+
+define(`BRW', `%v24')
+
+C During the loop, used to save the original values for last 4 words
+C of each block. Also used as temporaries for transpose.
+define(`T0', `%v25')
+define(`T1', `%v26')
+define(`T2', `%v27')
+define(`T3', `%v28')
+
+C QR: four ChaCha quarter rounds, computed in parallel
+define(`QR',`
+ vaf $1, $1, $2
+ vaf $5, $5, $6
+ vaf $9, $9, $10
+ vaf $13, $13, $14
+ vx $4, $4, $1
+ vx $8, $8, $5
+ vx $12, $12, $9
+ vx $16, $16, $13
+ verllf $4, $4, 16
+ verllf $8, $8, 16
+ verllf $12, $12, 16
+ verllf $16, $16, 16
+
+ vaf $3, $3, $4
+ vaf $7, $7, $8
+ vaf $11, $11, $12
+ vaf $15, $15, $16
+ vx $2, $2, $3
+ vx $6, $6, $7
+ vx $10, $10, $11
+ vx $14, $14, $15
+ verllf $2, $2, 12
+ verllf $6, $6, 12
+ verllf $10, $10, 12
+ verllf $14, $14, 12
+
+ vaf $1, $1, $2
+ vaf $5, $5, $6
+ vaf $9, $9, $10
+ vaf $13, $13, $14
+ vx $4, $4, $1
+ vx $8, $8, $5
+ vx $12, $12, $9
+ vx $16, $16, $13
+ verllf $4, $4, 8
+ verllf $8, $8, 8
+ verllf $12, $12, 8
+ verllf $16, $16, 8
+
+ vaf $3, $3, $4
+ vaf $7, $7, $8
+ vaf $11, $11, $12
+ vaf $15, $15, $16
+ vx $2, $2, $3
+ vx $6, $6, $7
+ vx $10, $10, $11
+ vx $14, $14, $15
+ verllf $2, $2, 7
+ verllf $6, $6, 7
+ verllf $10, $10, 7
+ verllf $14, $14, 7
+')
+
+define(`TRANSPOSE',`
+ vmrhf T0, $1, $3 C A0 A2 B0 B2
+ vmrhf T1, $2, $4 C A1 A3 B1 B3
+ vmrlf T2, $1, $3 C C0 C2 D0 D2
+ vmrlf T3, $2, $4 C C1 C3 D1 D3
+
+ vmrhf $1, T0, T1 C A0 A1 A2 A3
+ vmrlf $2, T0, T1 C B0 B1 B2 B3
+ vmrhf $3, T2, T3 C C0 C1 C2 C3
+ vmrlf $4, T2, T3 C D0 D1 D2 D3
+')
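
TRANSPOSE turns four "same word across all blocks" rows into four per-block rows; the net effect is a plain 4x4 transpose of 32-bit words, reached here with two levels of vmrhf/vmrlf merges instead of element-wise swaps. A scalar equivalent, for reference:

  /* Net effect of the TRANSPOSE macro: a 4x4 word transpose. */
  #include <stdint.h>

  static void
  transpose4x4 (uint32_t m[4][4])
  {
    for (unsigned i = 0; i < 4; i++)
      for (unsigned j = i + 1; j < 4; j++)
        {
          uint32_t t = m[i][j];
          m[i][j] = m[j][i];
          m[j][i] = t;
        }
  }
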
+
+.file "chacha-4core.asm"
+.machine "z13"
+
+.text
+C _chacha_4core(uint32_t *dst, const uint32_t *src, unsigned rounds)
+
+PROLOGUE(_nettle_chacha_4core)
+
+ vrepif T2, 1 C Keep counter carries (mask of 1 per word)
+
+.Lshared_entry:
+
+ C Save callee-save registers
+ ALLOC_STACK(%r1,64) C Allocate 64-byte space on stack
+ C Save non-volatile floating point registers
+ std %f8,0(%r1)
+ std %f9,8(%r1)
+ std %f10,16(%r1)
+ std %f11,24(%r1)
+ std %f12,32(%r1)
+ std %f13,40(%r1)
+ std %f14,48(%r1)
+ std %f15,56(%r1)
+
+ larl %r5,.Lword_byte_reverse
+ vlm BRW, T0, 0(%r5)
+
+C Load state and splat
+ vlm %v0, %v3, 0(SRC)
+
+ vrepf %v4, %v0, 1
+ vrepf %v8, %v0, 2
+ vrepf %v12, %v0, 3
+ vrepf %v0, %v0, 0
+ vrepf %v5, %v1, 1
+ vrepf %v9, %v1, 2
+ vrepf %v13, %v1, 3
+ vrepf %v1, %v1, 0
+ vrepf %v6, %v2, 1
+ vrepf %v10, %v2, 2
+ vrepf %v14, %v2, 3
+ vrepf %v2, %v2, 0
+ vrepf %v7, %v3, 1
+ vrepf %v11, %v3, 2
+ vrepf %v15, %v3, 3
+ vrepf %v3, %v3, 0
+
+ vaccf T1, %v3, T0 C compute carry-out
+ vaf %v3, %v3, T0 C low adds
+ vn T1, T1, T2 C discard carries for 32-bit counter variant
+ vaf %v7, %v7, T1 C apply carries
+
+ C Save the original values of the last four words of each block.
+ vlr T0, %v3
+ vlr T1, %v7
+ vlr T2, %v11
+ vlr T3, %v15
+
+ srlg ROUNDS, ROUNDS, 1
+
+.Loop:
+ QR(%v0, %v1, %v2, %v3, %v4, %v5, %v6, %v7, %v8, %v9, %v10, %v11, %v12, %v13, %v14, %v15)
+ QR(%v0, %v5, %v10, %v15, %v4, %v9, %v14, %v3, %v8, %v13, %v2, %v7, %v12, %v1, %v6, %v11)
+ brctg ROUNDS, .Loop
+
+ C Add in saved original words, including counters, before
+ C transpose.
+ vaf %v3, %v3, T0
+ vaf %v7, %v7, T1
+ vaf %v11, %v11, T2
+ vaf %v15, %v15, T3
+
+ TRANSPOSE(%v0, %v4, %v8, %v12)
+ TRANSPOSE(%v1, %v5, %v9, %v13)
+ TRANSPOSE(%v2, %v6, %v10, %v14)
+ TRANSPOSE(%v3, %v7, %v11, %v15)
+
+ vlm T0, T2, 0(SRC)
+
+ vaf %v0, %v0, T0
+ vaf %v4, %v4, T0
+ vaf %v8, %v8, T0
+ vaf %v12, %v12, T0
+
+ vperm %v0, %v0, %v0, BRW
+ vperm %v4, %v4, %v4, BRW
+ vperm %v8, %v8, %v8, BRW
+ vperm %v12, %v12, %v12, BRW
+
+ vaf %v1, %v1, T1
+ vaf %v5, %v5, T1
+ vaf %v9, %v9, T1
+ vaf %v13, %v13, T1
+
+ vperm %v1, %v1, %v1, BRW
+ vperm %v5, %v5, %v5, BRW
+ vperm %v9, %v9, %v9, BRW
+ vperm %v13, %v13, %v13, BRW
+
+ vaf %v2, %v2, T2
+ vaf %v6, %v6, T2
+ vaf %v10, %v10, T2
+ vaf %v14, %v14, T2
+
+ vperm %v2, %v2, %v2, BRW
+ vperm %v6, %v6, %v6, BRW
+ vperm %v10, %v10, %v10, BRW
+ vperm %v14, %v14, %v14, BRW
+
+ vperm %v3, %v3, %v3, BRW
+ vperm %v7, %v7, %v7, BRW
+ vperm %v11, %v11, %v11, BRW
+ vperm %v15, %v15, %v15, BRW
+
+ vstm %v0, %v15, 0(DST)
+
+ C Restore callee-save registers
+ ld %f8,0(%r1)
+ ld %f9,8(%r1)
+ ld %f10,16(%r1)
+ ld %f11,24(%r1)
+ ld %f12,32(%r1)
+ ld %f13,40(%r1)
+ ld %f14,48(%r1)
+ ld %f15,56(%r1)
+ FREE_STACK(64) C Deallocate stack space
+ br RA
+EPILOGUE(_nettle_chacha_4core)
+
+PROLOGUE(_nettle_chacha_4core32)
+
+ vzero T2 C Ignore counter carries
+ j .Lshared_entry
+EPILOGUE(_nettle_chacha_4core32)
+
+.align 16
+.Lword_byte_reverse: .long 0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C
+.Lcnts: .long 0,1,2,3 C increments
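
As in the 2-core case, the two entry points differ only in whether the per-block counter increments 0..3 (.Lcnts) may carry out of word 12 into word 13. A scalar model of the vaccf/vn/vaf sequence, with s12/s13 denoting state words 12 and 13 (illustration only):

  /* Scalar model of the 4core counter splat: block j gets counter
     increment j; the T2 mask (1 per word in _nettle_chacha_4core,
     zero in _nettle_chacha_4core32) keeps or drops the 0/1 carries. */
  #include <stdint.h>

  static void
  splat_counters (uint32_t w12[4], uint32_t w13[4],
                  uint32_t s12, uint32_t s13, int counter64)
  {
    for (unsigned j = 0; j < 4; j++)
      {
        uint32_t sum = s12 + j;      /* vaf %v3, %v3, T0: low adds */
        uint32_t carry = sum < s12;  /* vaccf T1, %v3, T0: carry-out */
        w12[j] = sum;
        w13[j] = s13 + (counter64 ? carry : 0); /* vn + vaf: apply */
      }
  }
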
--- /dev/null
+C s390x/vf/chacha-core-internal.asm
+
+ifelse(`
+ Copyright (C) 2020 Niels Möller and Torbjörn Granlund
+ Copyright (C) 2022 Mamone Tarsha
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+')
+
+C Register usage:
+
+C Arguments
+define(`DST', `%r2')
+define(`SRC', `%r3')
+define(`ROUNDS', `%r4')
+
+C Working state
+define(`X0', `%v0')
+define(`X1', `%v1')
+define(`X2', `%v2')
+define(`X3', `%v3')
+
+C Original input state
+define(`S0', `%v4')
+define(`S1', `%v5')
+define(`S2', `%v6')
+define(`S3', `%v7')
+
+define(`BRW', `%v24')
+
+C QROUND(X0, X1, X2, X3)
+define(`QROUND', `
+ C x0 += x1, x3 ^= x0, x3 lrot 16
+ C x2 += x3, x1 ^= x2, x1 lrot 12
+ C x0 += x1, x3 ^= x0, x3 lrot 8
+ C x2 += x3, x1 ^= x2, x1 lrot 7
+
+ vaf $1, $1, $2
+ vx $4, $4, $1
+ verllf $4, $4, 16
+
+ vaf $3, $3, $4
+ vx $2, $2, $3
+ verllf $2, $2, 12
+
+ vaf $1, $1, $2
+ vx $4, $4, $1
+ verllf $4, $4, 8
+
+ vaf $3, $3, $4
+ vx $2, $2, $3
+ verllf $2, $2, 7
+')
+
+.file "chacha-core-internal.asm"
+.machine "z13"
+
+.text
+C _chacha_core(uint32_t *dst, const uint32_t *src, unsigned rounds)
+
+PROLOGUE(_nettle_chacha_core)
+ vlm X0, X3, 0(SRC)
+
+ vlr S0, X0
+ vlr S1, X1
+ vlr S2, X2
+ vlr S3, X3
+
+ srlg ROUNDS, ROUNDS, 1
+.Loop:
+ QROUND(X0, X1, X2, X3)
+ C Rotate rows, to get
+ C 0 1 2 3
+ C 5 6 7 4 <<< 1
+ C 10 11 8 9 <<< 2
+ C 15 12 13 14 <<< 3
+
+ vsldb X1, X1, X1, 4
+ vsldb X2, X2, X2, 8
+ vsldb X3, X3, X3, 12
+
+ QROUND(X0, X1, X2, X3)
+
+ C Inverse rotation
+ vsldb X1, X1, X1, 12
+ vsldb X2, X2, X2, 8
+ vsldb X3, X3, X3, 4
+
+ brctg ROUNDS, .Loop
+
+ vaf X0, X0, S0
+ vaf X1, X1, S1
+ vaf X2, X2, S2
+ vaf X3, X3, S3
+
+ larl %r5,.Lword_byte_reverse
+ vl BRW, 0(%r5)
+ vperm X0, X0, X0, BRW
+ vperm X1, X1, X1, BRW
+ vperm X2, X2, X2, BRW
+ vperm X3, X3, X3, BRW
+
+ vstm X0, X3, 0(DST)
+ br RA
+EPILOGUE(_nettle_chacha_core)
+
+.align 16
+.Lword_byte_reverse: .long 0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C
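
For reference, the vector routine follows the usual scalar ChaCha core below: the first QROUND works on columns, the vsldb rotations realign the diagonals into columns for the second QROUND, and the final vperm through .Lword_byte_reverse makes the stores little-endian on this big-endian target. A sketch, assuming the standard ChaCha state layout (the byte order of the stores is not modeled here):

  /* Portable reference with the same structure as the vector code:
     column round, then diagonal round, then add the original state. */
  #include <stdint.h>
  #include <string.h>

  #define ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
  #define QROUND(a, b, c, d) do {          \
      a += b; d ^= a; d = ROTL32 (d, 16);  \
      c += d; b ^= c; b = ROTL32 (b, 12);  \
      a += b; d ^= a; d = ROTL32 (d, 8);   \
      c += d; b ^= c; b = ROTL32 (b, 7);   \
    } while (0)

  void
  chacha_core_ref (uint32_t *dst, const uint32_t *src, unsigned rounds)
  {
    uint32_t x[16];
    unsigned i;
    memcpy (x, src, sizeof (x));
    for (; rounds >= 2; rounds -= 2)
      {
        /* Column rounds (first QROUND in the .Loop body). */
        QROUND (x[0], x[4], x[8],  x[12]);
        QROUND (x[1], x[5], x[9],  x[13]);
        QROUND (x[2], x[6], x[10], x[14]);
        QROUND (x[3], x[7], x[11], x[15]);
        /* Diagonal rounds; the vsldb rotations map these onto the
           same column-wise QROUND in the vector code. */
        QROUND (x[0], x[5], x[10], x[15]);
        QROUND (x[1], x[6], x[11], x[12]);
        QROUND (x[2], x[7], x[8],  x[13]);
        QROUND (x[3], x[4], x[9],  x[14]);
      }
    for (i = 0; i < 16; i++)
      dst[i] = x[i] + src[i];  /* vaf X*, X*, S* */
  }
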