/* Optimized AArch64 implementation of ChaCha20 cipher.
   Copyright (C) 2022 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
/* Copyright (C) 2017-2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>

   This file is part of Libgcrypt.

   Libgcrypt is free software; you can redistribute it and/or modify
   it under the terms of the GNU Lesser General Public License as
   published by the Free Software Foundation; either version 2.1 of
   the License, or (at your option) any later version.

   Libgcrypt is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with this program; if not, see <https://www.gnu.org/licenses/>.  */
/* Based on D. J. Bernstein reference implementation at
   http://cr.yp.to/chacha.html:

   chacha-regs.c version 20080118
   D. J. Bernstein
   Public domain.  */
#include <sysdep.h>

/* Only LE is supported.  */
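/* GET_DATA_POINTER materializes the address of a nearby local symbol with
   the usual AArch64 ADRP + ADD pair: ADRP forms the symbol's 4 KiB page
   address and the ADD applies its low 12 bits (:lo12:) within that page.  */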
#define GET_DATA_POINTER(reg, name) \
        adrp    reg, name ; \
        add     reg, reg, :lo12:name ;
/* 'ret' instruction replacement for straight-line speculation mitigation */
#define ret_spec_stop \
        ret; dsb sy; isb
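
/* The DSB SY + ISB barriers placed after the RET are never executed
   architecturally; they only stop the core from speculatively running
   straight-line code past the return.  */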
/* vector registers */
/**********************************************************************
  helper macros
 **********************************************************************/
#define _(...) __VA_ARGS__
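
/* The vpunpck* macros below mimic the x86 punpck{l,h}{dq,qdq} unpack
   operations using AArch64 ZIP1/ZIP2, so the 4x4 transpose can be written
   the same way as in the SSE-style implementations; the source operands
   inside the macros are swapped to match that operand convention.  */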
#define vpunpckldq(s1, s2, dst) \
        zip1 dst.4s, s2.4s, s1.4s;

#define vpunpckhdq(s1, s2, dst) \
        zip2 dst.4s, s2.4s, s1.4s;

#define vpunpcklqdq(s1, s2, dst) \
        zip1 dst.2d, s2.2d, s1.2d;

#define vpunpckhqdq(s1, s2, dst) \
        zip2 dst.2d, s2.2d, s1.2d;
/* 4x4 32-bit integer matrix transpose */
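/* The transpose runs in two passes: the .4s unpacks interleave 32-bit lanes
   of row pairs, and the .2d unpacks then interleave the 64-bit halves,
   leaving the transposed matrix in x0, x1, x2, x3.  The t3 argument is not
   used by this variant.  */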
#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \
        vpunpckhdq(x1, x0, t2); \
        vpunpckldq(x1, x0, x0); \
        \
        vpunpckldq(x3, x2, t1); \
        vpunpckhdq(x3, x2, x2); \
        \
        vpunpckhqdq(t1, x0, x1); \
        vpunpcklqdq(t1, x0, x0); \
        \
        vpunpckhqdq(x2, t2, x3); \
        vpunpcklqdq(x2, t2, x2);
/**********************************************************************
  4-way chacha20
 **********************************************************************/
#define XOR(d,s1,s2) \
        eor d.16b, s2.16b, s1.16b;
#define PLUS(ds,s) \
        add ds.4s, ds.4s, s.4s;
#define ROTATE4(dst1,dst2,dst3,dst4,c,src1,src2,src3,src4) \
        shl dst1.4s, src1.4s, #(c); \
        shl dst2.4s, src2.4s, #(c); \
        shl dst3.4s, src3.4s, #(c); \
        shl dst4.4s, src4.4s, #(c); \
        sri dst1.4s, src1.4s, #(32 - (c)); \
        sri dst2.4s, src2.4s, #(32 - (c)); \
        sri dst3.4s, src3.4s, #(32 - (c)); \
        sri dst4.4s, src4.4s, #(32 - (c));

#define ROTATE4_8(dst1,dst2,dst3,dst4,src1,src2,src3,src4) \
        tbl dst1.16b, {src1.16b}, ROT8.16b; \
        tbl dst2.16b, {src2.16b}, ROT8.16b; \
        tbl dst3.16b, {src3.16b}, ROT8.16b; \
        tbl dst4.16b, {src4.16b}, ROT8.16b;

#define ROTATE4_16(dst1,dst2,dst3,dst4,src1,src2,src3,src4) \
        rev32 dst1.8h, src1.8h; \
        rev32 dst2.8h, src2.8h; \
        rev32 dst3.8h, src3.8h; \
        rev32 dst4.8h, src4.8h;
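
/* NEON has no 32-bit vector rotate, so the generic ROTATE4 builds one from
   SHL plus SRI (shift right and insert).  The two fixed amounts used by
   ChaCha get cheaper forms: rotation by 16 is a REV32 on 16-bit lanes and
   rotation by 8 is a single TBL byte shuffle through the ROT8 index
   vector.  */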
#define QUARTERROUND4(a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,a4,b4,c4,d4,ign,tmp1,tmp2,tmp3,tmp4) \
        PLUS(a1,b1); PLUS(a2,b2); \
        PLUS(a3,b3); PLUS(a4,b4); \
            XOR(tmp1,d1,a1); XOR(tmp2,d2,a2); \
            XOR(tmp3,d3,a3); XOR(tmp4,d4,a4); \
                ROTATE4_16(d1, d2, d3, d4, tmp1, tmp2, tmp3, tmp4); \
        PLUS(c1,d1); PLUS(c2,d2); \
        PLUS(c3,d3); PLUS(c4,d4); \
            XOR(tmp1,b1,c1); XOR(tmp2,b2,c2); \
            XOR(tmp3,b3,c3); XOR(tmp4,b4,c4); \
                ROTATE4(b1, b2, b3, b4, 12, tmp1, tmp2, tmp3, tmp4) \
        PLUS(a1,b1); PLUS(a2,b2); \
        PLUS(a3,b3); PLUS(a4,b4); \
            XOR(tmp1,d1,a1); XOR(tmp2,d2,a2); \
            XOR(tmp3,d3,a3); XOR(tmp4,d4,a4); \
                ROTATE4_8(d1, d2, d3, d4, tmp1, tmp2, tmp3, tmp4) \
        PLUS(c1,d1); PLUS(c2,d2); \
        PLUS(c3,d3); PLUS(c4,d4); \
            XOR(tmp1,b1,c1); XOR(tmp2,b2,c2); \
            XOR(tmp3,b3,c3); XOR(tmp4,b4,c4); \
                ROTATE4(b1, b2, b3, b4, 7, tmp1, tmp2, tmp3, tmp4)
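
/* Each expansion of QUARTERROUND4 is the standard ChaCha quarter round

       a += b; d ^= a; d <<<= 16;
       c += d; b ^= c; b <<<= 12;
       a += b; d ^= a; d <<<=  8;
       c += d; b ^= c; b <<<=  7;

   applied to four column/diagonal groups at once, and every vector lane
   carries the corresponding state word of a different 64-byte block, so one
   invocation advances four blocks in parallel.  */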
L(__chacha20_blocks4_data_inc_counter):
        .long 0,1,2,3
L(__chacha20_blocks4_data_rot8):
        .byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14  /* TBL index: rotate each 32-bit word left by 8 */
        .hidden __chacha20_neon_blocks4
ENTRY (__chacha20_neon_blocks4)
        /* input: x0: input (state), x1: dst, x2: src, x3: nblks (multiple of 4) */
        GET_DATA_POINTER(CTR, L(__chacha20_blocks4_data_rot8))
        add INPUT_CTR, INPUT, #(12*4);
        ld1 {ROT8.16b}, [CTR];
        GET_DATA_POINTER(CTR, L(__chacha20_blocks4_data_inc_counter))
        mov INPUT_POS, INPUT;
        ld1 {VCTR.16b}, [CTR];

        /* Construct counter vectors X12 and X13 */
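        /* VCTR holds the block-index increments {0,1,2,3} loaded above, so
           X12 = counter + {0,1,2,3} gives each of the four blocks its own
           low counter word.  The CMHI below compares VCTR with the summed
           X12: a lane whose addition wrapped around 2^32 produces all-ones
           (-1), and subtracting that mask from X13 propagates the carry into
           the high counter word.  */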
        ld1 {X15.16b}, [INPUT_CTR];
        ld1 {VTMP1.16b-VTMP3.16b}, [INPUT_POS];
        dup X12.4s, X15.s[0];
        dup X13.4s, X15.s[1];
        ldr CTR, [INPUT_CTR];
        add X12.4s, X12.4s, VCTR.4s;
        dup X0.4s, VTMP1.s[0];
        dup X1.4s, VTMP1.s[1];
        dup X2.4s, VTMP1.s[2];
        dup X3.4s, VTMP1.s[3];
        dup X14.4s, X15.s[2];
        cmhi VTMP0.4s, VCTR.4s, X12.4s;
        dup X15.4s, X15.s[3];
        add CTR, CTR, #4;  /* Update counter */
        dup X4.4s, VTMP2.s[0];
        dup X5.4s, VTMP2.s[1];
        dup X6.4s, VTMP2.s[2];
        dup X7.4s, VTMP2.s[3];
        sub X13.4s, X13.4s, VTMP0.4s;
        dup X8.4s, VTMP3.s[0];
        dup X9.4s, VTMP3.s[1];
        dup X10.4s, VTMP3.s[2];
        dup X11.4s, VTMP3.s[3];
        mov X12_TMP.16b, X12.16b;
        mov X13_TMP.16b, X13.16b;
        str CTR, [INPUT_CTR];
        subs ROUND, ROUND, #2
        QUARTERROUND4(X0, X4,  X8, X12,   X1, X5,  X9, X13,
                      X2, X6, X10, X14,   X3, X7, X11, X15,
                      tmp:=,VTMP0,VTMP1,VTMP2,VTMP3)
        QUARTERROUND4(X0, X5, X10, X15,   X1, X6, X11, X12,
                      X2, X7,  X8, X13,   X3, X4,  X9, X14,
                      tmp:=,VTMP0,VTMP1,VTMP2,VTMP3)
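
        /* After the round loop the original input words are added back into
           the working state (the ChaCha feed-forward).  The input is
           re-broadcast with DUP and accumulated lane by lane; X12_TMP and
           X13_TMP supply the per-block counter values saved before the
           rounds.  */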
        ld1 {VTMP0.16b, VTMP1.16b}, [INPUT_POS], #32;

        PLUS(X12, X12_TMP);        /* INPUT + 12 * 4 + counter */
        PLUS(X13, X13_TMP);        /* INPUT + 13 * 4 + counter */

        dup VTMP2.4s, VTMP0.s[0];    /* INPUT + 0 * 4 */
        dup VTMP3.4s, VTMP0.s[1];    /* INPUT + 1 * 4 */
        dup X12_TMP.4s, VTMP0.s[2];  /* INPUT + 2 * 4 */
        dup X13_TMP.4s, VTMP0.s[3];  /* INPUT + 3 * 4 */

        dup VTMP2.4s, VTMP1.s[0];    /* INPUT + 4 * 4 */
        dup VTMP3.4s, VTMP1.s[1];    /* INPUT + 5 * 4 */
        dup X12_TMP.4s, VTMP1.s[2];  /* INPUT + 6 * 4 */
        dup X13_TMP.4s, VTMP1.s[3];  /* INPUT + 7 * 4 */
        ld1 {VTMP0.16b, VTMP1.16b}, [INPUT_POS];
        mov INPUT_POS, INPUT;

        dup VTMP2.4s, VTMP0.s[0];    /* INPUT + 8 * 4 */
        dup VTMP3.4s, VTMP0.s[1];    /* INPUT + 9 * 4 */
        dup X12_TMP.4s, VTMP0.s[2];  /* INPUT + 10 * 4 */
        dup X13_TMP.4s, VTMP0.s[3];  /* INPUT + 11 * 4 */
        dup VTMP0.4s, VTMP1.s[2];    /* INPUT + 14 * 4 */
        dup VTMP1.4s, VTMP1.s[3];    /* INPUT + 15 * 4 */
        transpose_4x4(X0, X1, X2, X3, VTMP0, VTMP1, VTMP2);
        transpose_4x4(X4, X5, X6, X7, VTMP0, VTMP1, VTMP2);
        transpose_4x4(X8, X9, X10, X11, VTMP0, VTMP1, VTMP2);
        transpose_4x4(X12, X13, X14, X15, VTMP0, VTMP1, VTMP2);
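
        /* During the rounds each register held one state word across four
           blocks; the transposes above regroup the data so that X0,X4,X8,X12
           now hold the 64 bytes of block 0, X1,X5,X9,X13 block 1,
           X2,X6,X10,X14 block 2 and X3,X7,X11,X15 block 3, which is the
           order the stores below write them out.  */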
        subs NBLKS, NBLKS, #4;

        st1 {X0.16b, X4.16b, X8.16b, X12.16b}, [DST], #64
        st1 {X1.16b, X5.16b}, [DST], #32;
        st1 {X9.16b, X13.16b, X2.16b, X6.16b}, [DST], #64
        st1 {X10.16b, X14.16b}, [DST], #32;
        st1 {X3.16b, X7.16b, X11.16b, X15.16b}, [DST], #64;

END (__chacha20_neon_blocks4)