/* Optimized AArch64 implementation of ChaCha20 cipher.
   Copyright (C) 2022 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* Copyright (C) 2017-2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>

   This file is part of Libgcrypt.

   Libgcrypt is free software; you can redistribute it and/or modify
   it under the terms of the GNU Lesser General Public License as
   published by the Free Software Foundation; either version 2.1 of
   the License, or (at your option) any later version.

   Libgcrypt is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with this program; if not, see <https://www.gnu.org/licenses/>.
 */

/* Based on D. J. Bernstein reference implementation at
   http://cr.yp.to/chacha.html:

   chacha-regs.c version 20080118
   D. J. Bernstein
   Public domain.  */

#include <sysdep.h>

/* Only LE is supported.  */
#ifdef __AARCH64EL__

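/* Materialize the address of a nearby symbol PC-relatively: adrp yields the
   symbol's 4 KiB-aligned page address and the :lo12: relocation on the add
   supplies the offset within that page.  */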
#define GET_DATA_POINTER(reg, name) \
        adrp reg, name ; \
        add reg, reg, :lo12:name

/* 'ret' instruction replacement for straight-line speculation mitigation.  */
#define ret_spec_stop \
        ret; dsb sy; isb;
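/* Note: the dsb sy; isb pair after the ret is never reached architecturally;
   its only purpose is to stop the core from speculatively executing the
   instructions that follow the return (straight-line speculation).  */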

        .cpu generic+simd

        .text

/* register macros */
#define INPUT     x0
#define DST       x1
#define SRC       x2
#define NBLKS     x3
#define ROUND     x4
#define INPUT_CTR x5
#define INPUT_POS x6
#define CTR       x7

/* vector registers */
#define X0 v16
#define X4 v17
#define X8 v18
#define X12 v19

#define X1 v20
#define X5 v21

#define X9 v22
#define X13 v23
#define X2 v24
#define X6 v25

#define X3 v26
#define X7 v27
#define X11 v28
#define X15 v29

#define X10 v30
#define X14 v31

#define VCTR    v0
#define VTMP0   v1
#define VTMP1   v2
#define VTMP2   v3
#define VTMP3   v4
#define X12_TMP v5
#define X13_TMP v6
#define ROT8    v7

/**********************************************************************
  helper macros
 **********************************************************************/

#define _(...) __VA_ARGS__

#define vpunpckldq(s1, s2, dst) \
        zip1 dst.4s, s2.4s, s1.4s;

#define vpunpckhdq(s1, s2, dst) \
        zip2 dst.4s, s2.4s, s1.4s;

#define vpunpcklqdq(s1, s2, dst) \
        zip1 dst.2d, s2.2d, s1.2d;

#define vpunpckhqdq(s1, s2, dst) \
        zip2 dst.2d, s2.2d, s1.2d;
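/* These helpers use x86-style punpck* names but are implemented with the
   AArch64 zip1/zip2 interleaves; note that the source operands are passed
   to zip1/zip2 in swapped order.  */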

/* 4x4 32-bit integer matrix transpose */
#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \
        vpunpckhdq(x1, x0, t2); \
        vpunpckldq(x1, x0, x0); \
        \
        vpunpckldq(x3, x2, t1); \
        vpunpckhdq(x3, x2, x2); \
        \
        vpunpckhqdq(t1, x0, x1); \
        vpunpcklqdq(t1, x0, x0); \
        \
        vpunpckhqdq(x2, t2, x3); \
        vpunpcklqdq(x2, t2, x2);
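/* A sketch of the effect, treating {x0,x1,x2,x3} as rows of a 4x4 matrix of
   32-bit elements (lowest lane first):

     x0 = {a b c d}        x0 = {a e i m}
     x1 = {e f g h}  --->  x1 = {b f j n}
     x2 = {i j k l}        x2 = {c g k o}
     x3 = {m n o p}        x3 = {d h l p}

   t1 and t2 are clobbered as scratch; t3 is accepted but not used.  */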

/**********************************************************************
  4-way chacha20
 **********************************************************************/

#define XOR(d,s1,s2) \
        eor d.16b, s2.16b, s1.16b;

#define PLUS(ds,s) \
        add ds.4s, ds.4s, s.4s;

#define ROTATE4(dst1,dst2,dst3,dst4,c,src1,src2,src3,src4) \
        shl dst1.4s, src1.4s, #(c); \
        shl dst2.4s, src2.4s, #(c); \
        shl dst3.4s, src3.4s, #(c); \
        shl dst4.4s, src4.4s, #(c); \
        sri dst1.4s, src1.4s, #(32 - (c)); \
        sri dst2.4s, src2.4s, #(32 - (c)); \
        sri dst3.4s, src3.4s, #(32 - (c)); \
        sri dst4.4s, src4.4s, #(32 - (c));

#define ROTATE4_8(dst1,dst2,dst3,dst4,src1,src2,src3,src4) \
        tbl dst1.16b, {src1.16b}, ROT8.16b; \
        tbl dst2.16b, {src2.16b}, ROT8.16b; \
        tbl dst3.16b, {src3.16b}, ROT8.16b; \
        tbl dst4.16b, {src4.16b}, ROT8.16b;

#define ROTATE4_16(dst1,dst2,dst3,dst4,src1,src2,src3,src4) \
        rev32 dst1.8h, src1.8h; \
        rev32 dst2.8h, src2.8h; \
        rev32 dst3.8h, src3.8h; \
        rev32 dst4.8h, src4.8h;

#define QUARTERROUND4(a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,a4,b4,c4,d4,ign,tmp1,tmp2,tmp3,tmp4) \
        PLUS(a1,b1); PLUS(a2,b2); \
        PLUS(a3,b3); PLUS(a4,b4); \
        XOR(tmp1,d1,a1); XOR(tmp2,d2,a2); \
        XOR(tmp3,d3,a3); XOR(tmp4,d4,a4); \
        ROTATE4_16(d1, d2, d3, d4, tmp1, tmp2, tmp3, tmp4); \
        PLUS(c1,d1); PLUS(c2,d2); \
        PLUS(c3,d3); PLUS(c4,d4); \
        XOR(tmp1,b1,c1); XOR(tmp2,b2,c2); \
        XOR(tmp3,b3,c3); XOR(tmp4,b4,c4); \
        ROTATE4(b1, b2, b3, b4, 12, tmp1, tmp2, tmp3, tmp4) \
        PLUS(a1,b1); PLUS(a2,b2); \
        PLUS(a3,b3); PLUS(a4,b4); \
        XOR(tmp1,d1,a1); XOR(tmp2,d2,a2); \
        XOR(tmp3,d3,a3); XOR(tmp4,d4,a4); \
        ROTATE4_8(d1, d2, d3, d4, tmp1, tmp2, tmp3, tmp4) \
        PLUS(c1,d1); PLUS(c2,d2); \
        PLUS(c3,d3); PLUS(c4,d4); \
        XOR(tmp1,b1,c1); XOR(tmp2,b2,c2); \
        XOR(tmp3,b3,c3); XOR(tmp4,b4,c4); \
        ROTATE4(b1, b2, b3, b4, 7, tmp1, tmp2, tmp3, tmp4) \

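/* For reference, one scalar ChaCha20 quarter round on words (a, b, c, d) is:

     a += b; d ^= a; d = rotl32(d, 16);
     c += d; b ^= c; b = rotl32(b, 12);
     a += b; d ^= a; d = rotl32(d,  8);
     c += d; b ^= c; b = rotl32(b,  7);

   QUARTERROUND4 applies this to four (a,b,c,d) quadruples at once, and each
   vector register holds that word for four independent blocks (one block per
   lane), so one invocation does 16 scalar quarter rounds' worth of work.
   The rotate by 16 is a rev32 on halfwords, the rotate by 8 is a tbl byte
   shuffle through ROT8, and the 12/7-bit rotates use shl+sri pairs.  */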
        .align 4
L(__chacha20_blocks4_data_inc_counter):
        .long 0,1,2,3

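/* Byte-select table for tbl: in each little-endian 32-bit lane, moving source
   bytes {3,0,1,2} into destination bytes {0,1,2,3} rotates the lane left by
   8 bits.  */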
        .align 4
L(__chacha20_blocks4_data_rot8):
        .byte 3,0,1,2
        .byte 7,4,5,6
        .byte 11,8,9,10
        .byte 15,12,13,14

        .hidden __chacha20_neon_blocks4
ENTRY (__chacha20_neon_blocks4)
        /* input:
         *      x0: input
         *      x1: dst
         *      x2: src
         *      x3: nblks (multiple of 4)
         */

        GET_DATA_POINTER(CTR, L(__chacha20_blocks4_data_rot8))
        add INPUT_CTR, INPUT, #(12*4);
        ld1 {ROT8.16b}, [CTR];
        GET_DATA_POINTER(CTR, L(__chacha20_blocks4_data_inc_counter))
        mov INPUT_POS, INPUT;
        ld1 {VCTR.16b}, [CTR];

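/* Each iteration of loop4 produces four 64-byte blocks: the 16 state words
   at INPUT are broadcast into X0-X15 (one word per register, one block per
   lane), the per-lane counter offsets {0,1,2,3} are added, the 20 rounds are
   run as 10 column/diagonal double rounds, the original state is added back,
   and the four blocks are transposed into row order and stored to DST.  */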
L(loop4):
        /* Construct counter vectors X12 and X13.  */
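        /* The low counter word is broadcast into X12 and {0,1,2,3} is added
           per lane; cmhi then flags any lane that wrapped around, and
           subtracting that all-ones flag bumps the corresponding lane of the
           high word in X13.  The stored counter itself is advanced by 4 with
           a single 64-bit ldr/add/str on CTR.  */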

        ld1 {X15.16b}, [INPUT_CTR];
        mov ROUND, #20;
        ld1 {VTMP1.16b-VTMP3.16b}, [INPUT_POS];

        dup X12.4s, X15.s[0];
        dup X13.4s, X15.s[1];
        ldr CTR, [INPUT_CTR];
        add X12.4s, X12.4s, VCTR.4s;
        dup X0.4s, VTMP1.s[0];
        dup X1.4s, VTMP1.s[1];
        dup X2.4s, VTMP1.s[2];
        dup X3.4s, VTMP1.s[3];
        dup X14.4s, X15.s[2];
        cmhi VTMP0.4s, VCTR.4s, X12.4s;
        dup X15.4s, X15.s[3];
        add CTR, CTR, #4; /* Update counter */
        dup X4.4s, VTMP2.s[0];
        dup X5.4s, VTMP2.s[1];
        dup X6.4s, VTMP2.s[2];
        dup X7.4s, VTMP2.s[3];
        sub X13.4s, X13.4s, VTMP0.4s;
        dup X8.4s, VTMP3.s[0];
        dup X9.4s, VTMP3.s[1];
        dup X10.4s, VTMP3.s[2];
        dup X11.4s, VTMP3.s[3];
        mov X12_TMP.16b, X12.16b;
        mov X13_TMP.16b, X13.16b;
        str CTR, [INPUT_CTR];

L(round2):
        subs ROUND, ROUND, #2;
        QUARTERROUND4(X0, X4,  X8, X12,   X1, X5,  X9, X13,
                      X2, X6, X10, X14,   X3, X7, X11, X15,
                      tmp:=,VTMP0,VTMP1,VTMP2,VTMP3)
        QUARTERROUND4(X0, X5, X10, X15,   X1, X6, X11, X12,
                      X2, X7,  X8, X13,   X3, X4,  X9, X14,
                      tmp:=,VTMP0,VTMP1,VTMP2,VTMP3)
        b.ne L(round2);

        ld1 {VTMP0.16b, VTMP1.16b}, [INPUT_POS], #32;

        PLUS(X12, X12_TMP);        /* INPUT + 12 * 4 + counter */
        PLUS(X13, X13_TMP);        /* INPUT + 13 * 4 + counter */

        dup VTMP2.4s, VTMP0.s[0]; /* INPUT + 0 * 4 */
        dup VTMP3.4s, VTMP0.s[1]; /* INPUT + 1 * 4 */
        dup X12_TMP.4s, VTMP0.s[2]; /* INPUT + 2 * 4 */
        dup X13_TMP.4s, VTMP0.s[3]; /* INPUT + 3 * 4 */
        PLUS(X0, VTMP2);
        PLUS(X1, VTMP3);
        PLUS(X2, X12_TMP);
        PLUS(X3, X13_TMP);

        dup VTMP2.4s, VTMP1.s[0]; /* INPUT + 4 * 4 */
        dup VTMP3.4s, VTMP1.s[1]; /* INPUT + 5 * 4 */
        dup X12_TMP.4s, VTMP1.s[2]; /* INPUT + 6 * 4 */
        dup X13_TMP.4s, VTMP1.s[3]; /* INPUT + 7 * 4 */
        ld1 {VTMP0.16b, VTMP1.16b}, [INPUT_POS];
        mov INPUT_POS, INPUT;
        PLUS(X4, VTMP2);
        PLUS(X5, VTMP3);
        PLUS(X6, X12_TMP);
        PLUS(X7, X13_TMP);

        dup VTMP2.4s, VTMP0.s[0]; /* INPUT + 8 * 4 */
        dup VTMP3.4s, VTMP0.s[1]; /* INPUT + 9 * 4 */
        dup X12_TMP.4s, VTMP0.s[2]; /* INPUT + 10 * 4 */
        dup X13_TMP.4s, VTMP0.s[3]; /* INPUT + 11 * 4 */
        dup VTMP0.4s, VTMP1.s[2]; /* INPUT + 14 * 4 */
        dup VTMP1.4s, VTMP1.s[3]; /* INPUT + 15 * 4 */
        PLUS(X8, VTMP2);
        PLUS(X9, VTMP3);
        PLUS(X10, X12_TMP);
        PLUS(X11, X13_TMP);
        PLUS(X14, VTMP0);
        PLUS(X15, VTMP1);

        transpose_4x4(X0, X1, X2, X3, VTMP0, VTMP1, VTMP2);
        transpose_4x4(X4, X5, X6, X7, VTMP0, VTMP1, VTMP2);
        transpose_4x4(X8, X9, X10, X11, VTMP0, VTMP1, VTMP2);
        transpose_4x4(X12, X13, X14, X15, VTMP0, VTMP1, VTMP2);
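        /* After the transposes each block is contiguous: block 0 in
           X0,X4,X8,X12, block 1 in X1,X5,X9,X13, block 2 in X2,X6,X10,X14
           and block 3 in X3,X7,X11,X15.  The stores below emit them in that
           order; the register numbering at the top of the file appears to be
           chosen so that each store group maps to consecutively numbered v
           registers, as multi-register st1 requires.  */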

        subs NBLKS, NBLKS, #4;

        st1 {X0.16b,X4.16b,X8.16b,X12.16b}, [DST], #64;
        st1 {X1.16b,X5.16b}, [DST], #32;
        st1 {X9.16b,X13.16b,X2.16b,X6.16b}, [DST], #64;
        st1 {X10.16b,X14.16b}, [DST], #32;
        st1 {X3.16b,X7.16b,X11.16b,X15.16b}, [DST], #64;

        b.ne L(loop4);

        ret_spec_stop
END (__chacha20_neon_blocks4)

#endif