/* Optimized SSE2 implementation of ChaCha20 cipher.
   Copyright (C) 2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* chacha20-amd64-ssse3.S - SSSE3 implementation of ChaCha20 cipher

   Copyright (C) 2017-2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>

   This file is part of Libgcrypt.

   Libgcrypt is free software; you can redistribute it and/or modify
   it under the terms of the GNU Lesser General Public License as
   published by the Free Software Foundation; either version 2.1 of
   the License, or (at your option) any later version.

   Libgcrypt is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with this program; if not, see <https://www.gnu.org/licenses/>.
 */

/* Based on D. J. Bernstein reference implementation at
   http://cr.yp.to/chacha.html:

   chacha-regs.c version 20080118
   D. J. Bernstein
   Public domain.  */

#include <sysdep.h>
#include <isa-level.h>

#if MINIMUM_X86_ISA_LEVEL <= 2

#ifdef PIC
# define rRIP (%rip)
#else
# define rRIP
#endif

/* 'ret' instruction replacement for straight-line speculation mitigation */
#define ret_spec_stop \
        ret; int3;

/* register macros */
#define INPUT %rdi
#define DST %rsi
#define SRC %rdx
#define NBLKS %rcx
#define ROUND %eax

/* stack structure */
#define STACK_VEC_X12 (16)
#define STACK_VEC_X13 (16 + STACK_VEC_X12)
#define STACK_TMP (16 + STACK_VEC_X13)
#define STACK_TMP1 (16 + STACK_TMP)
#define STACK_TMP2 (16 + STACK_TMP1)

#define STACK_MAX (16 + STACK_TMP2)
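
/* For reference, the frame carved out in the prologue below (offsets
   from the 16-byte-aligned %rsp, one 16-byte vector per slot):
   STACK_VEC_X12 at 16, STACK_VEC_X13 at 32, STACK_TMP at 48,
   STACK_TMP1 at 64 and STACK_TMP2 at 80, for a STACK_MAX of 96 bytes.
   The first two slots hold the per-block counter vectors; the TMP
   slots are spill space that the round loop and the output phase
   rotate registers through, since all 16 XMM registers stay live.  */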

/* vector registers */
#define X0 %xmm0
#define X1 %xmm1
#define X2 %xmm2
#define X3 %xmm3
#define X4 %xmm4
#define X5 %xmm5
#define X6 %xmm6
#define X7 %xmm7
#define X8 %xmm8
#define X9 %xmm9
#define X10 %xmm10
#define X11 %xmm11
#define X12 %xmm12
#define X13 %xmm13
#define X14 %xmm14
#define X15 %xmm15

/**********************************************************************
  helper macros
 **********************************************************************/

/* 4x4 32-bit integer matrix transpose */
#define TRANSPOSE_4x4(x0, x1, x2, x3, t1, t2, t3) \
        movdqa x0, t2; \
        punpckhdq x1, t2; \
        punpckldq x1, x0; \
        \
        movdqa x2, t1; \
        punpckldq x3, t1; \
        punpckhdq x3, x2; \
        \
        movdqa x0, x1; \
        punpckhqdq t1, x1; \
        punpcklqdq t1, x0; \
        \
        movdqa t2, x3; \
        punpckhqdq x2, x3; \
        punpcklqdq x2, t2; \
        movdqa t2, x2;
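
/* Viewing {x0,x1,x2,x3} as the rows of a 4x4 matrix of 32-bit words,
   TRANSPOSE_4x4 leaves the transposed matrix in {x0,x1,x2,x3}:

       x0 = {a0,a1,a2,a3}           x0 = {a0,b0,c0,d0}
       x1 = {b0,b1,b2,b3}    -->    x1 = {a1,b1,c1,d1}
       x2 = {c0,c1,c2,c3}           x2 = {a2,b2,c2,d2}
       x3 = {d0,d1,d2,d3}           x3 = {a3,b3,c3,d3}

   t1 and t2 are clobbered; t3 is accepted but unused.  */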

/* fill xmm register with 32-bit value from memory */
#define PBROADCASTD(mem32, xreg) \
        movd mem32, xreg; \
        pshufd $0, xreg, xreg;
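
/* SSE2 has no broadcast instruction, so PBROADCASTD emulates one:
   movd loads the 32-bit word into lane 0 and pshufd $0 copies lane 0
   into all four lanes; roughly xreg = { *mem32, *mem32, *mem32, *mem32 }.  */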

/**********************************************************************
  4-way chacha20
 **********************************************************************/

#define ROTATE2(v1,v2,c,tmp1,tmp2) \
        movdqa v1, tmp1; \
        movdqa v2, tmp2; \
        psrld $(32 - (c)), v1; \
        pslld $(c), tmp1; \
        paddb tmp1, v1; \
        psrld $(32 - (c)), v2; \
        pslld $(c), tmp2; \
        paddb tmp2, v2;
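
/* ROTATE2 rotates every 32-bit lane of v1 and v2 left by c bits,
   i.e. v = (v << c) | (v >> (32 - c)) per lane.  The left- and
   right-shifted halves have no set bits in common, so the byte-wise
   paddb never produces a carry and is equivalent to por here (likely
   chosen to spread the work over more execution ports).  */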

#define XOR(ds,s) \
        pxor s, ds;

#define PLUS(ds,s) \
        paddd s, ds;

#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2,ign,tmp1,tmp2) \
        PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \
        ROTATE2(d1, d2, 16, tmp1, tmp2); \
        PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \
        ROTATE2(b1, b2, 12, tmp1, tmp2); \
        PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \
        ROTATE2(d1, d2, 8, tmp1, tmp2); \
        PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \
        ROTATE2(b1, b2, 7, tmp1, tmp2);
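
/* QUARTERROUND2 interleaves two independent ChaCha20 quarter-rounds,
   (a1,b1,c1,d1) and (a2,b2,c2,d2), to hide instruction latency; the
   'ign' argument is ignored.  Each one is the standard quarter-round,
   applied lane-wise to the same position of four blocks at once:

       a += b; d ^= a; d = rotl32(d, 16);
       c += d; b ^= c; b = rotl32(b, 12);
       a += b; d ^= a; d = rotl32(d,  8);
       c += d; b ^= c; b = rotl32(b,  7);
   */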

        .section .text.sse2,"ax",@progbits

chacha20_data:
        .align 16
L(counter1):
        .long 1,0,0,0
L(inc_counter):
        .long 0,1,2,3
L(unsigned_cmp):
        .long 0x80000000,0x80000000,0x80000000,0x80000000
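
/* L(inc_counter) supplies the per-lane block offsets 0..3 added to the
   counter word; L(unsigned_cmp) is the 0x80000000 bias that turns the
   signed pcmpgtd into an unsigned compare for carry detection below.
   L(counter1) is not referenced by the 4-block routine in this file.  */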

        .hidden __chacha20_sse2_blocks4
ENTRY (__chacha20_sse2_blocks4)
        /* input:
         *      %rdi: input
         *      %rsi: dst
         *      %rdx: src
         *      %rcx: nblks (multiple of 4)
         */
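
        /* At the C level this is, roughly,
         *     void __chacha20_sse2_blocks4 (uint32_t *state, uint8_t *dst,
         *                                   const uint8_t *src, size_t nblks);
         * (prototype shown for orientation only).  It expands the
         * 16-word ChaCha20 state into nblks consecutive 64-byte
         * keystream blocks at dst, four per iteration, and advances
         * the 64-bit block counter in state words 12-13 by nblks.
         * As used here, src is only advanced in step with dst and is
         * never read.  */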

        pushq %rbp;
        cfi_adjust_cfa_offset(8);
        cfi_rel_offset(rbp, 0)
        movq %rsp, %rbp;
        cfi_def_cfa_register(%rbp);

        subq $STACK_MAX, %rsp;
        andq $~15, %rsp;

L(loop4):
        mov $20, ROUND;

        /* Construct counter vectors X12 and X13 */
        movdqa L(inc_counter) rRIP, X0;
        movdqa L(unsigned_cmp) rRIP, X2;
        PBROADCASTD((12 * 4)(INPUT), X12);
        PBROADCASTD((13 * 4)(INPUT), X13);
        paddd X0, X12;
        movdqa X12, X1;
        pxor X2, X0;
        pxor X2, X1;
        pcmpgtd X1, X0;
        psubd X0, X13;
        movdqa X12, (STACK_VEC_X12)(%rsp);
        movdqa X13, (STACK_VEC_X13)(%rsp);
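
        /* The sequence above computes, roughly and per lane i = 0..3:
         *     x12[i] = state[12] + i;
         *     x13[i] = state[13] + ((uint32_t) x12[i] < state[12]);
         * pcmpgtd only performs signed compares, so both operands are
         * first biased with 0x80000000; lanes whose 32-bit counter
         * wrapped end up as all-ones (-1) in X0, and subtracting that
         * mask from X13 adds the carry.  */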

        /* Load vectors */
        PBROADCASTD((0 * 4)(INPUT), X0);
        PBROADCASTD((1 * 4)(INPUT), X1);
        PBROADCASTD((2 * 4)(INPUT), X2);
        PBROADCASTD((3 * 4)(INPUT), X3);
        PBROADCASTD((4 * 4)(INPUT), X4);
        PBROADCASTD((5 * 4)(INPUT), X5);
        PBROADCASTD((6 * 4)(INPUT), X6);
        PBROADCASTD((7 * 4)(INPUT), X7);
        PBROADCASTD((8 * 4)(INPUT), X8);
        PBROADCASTD((9 * 4)(INPUT), X9);
        PBROADCASTD((10 * 4)(INPUT), X10);
        PBROADCASTD((11 * 4)(INPUT), X11);
        PBROADCASTD((14 * 4)(INPUT), X14);
        PBROADCASTD((15 * 4)(INPUT), X15);
        movdqa X11, (STACK_TMP)(%rsp);
        movdqa X15, (STACK_TMP1)(%rsp);

L(round2_4):
        QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15)
        movdqa (STACK_TMP)(%rsp), X11;
        movdqa (STACK_TMP1)(%rsp), X15;
        movdqa X8, (STACK_TMP)(%rsp);
        movdqa X9, (STACK_TMP1)(%rsp);
        QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9)
        QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9)
        movdqa (STACK_TMP)(%rsp), X8;
        movdqa (STACK_TMP1)(%rsp), X9;
        movdqa X11, (STACK_TMP)(%rsp);
        movdqa X15, (STACK_TMP1)(%rsp);
        QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15)
        sub $2, ROUND;
        jnz L(round2_4);
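
        /* 20 ChaCha20 rounds, two per loop iteration: the first pair
         * of QUARTERROUND2s performs the four column rounds, the
         * second pair the four diagonal rounds.  With all 16 state
         * words resident in XMM registers there is no register left
         * for temporaries, so X8/X9 and X11/X15 take turns being
         * spilled to STACK_TMP/STACK_TMP1 while the other pair serves
         * as scratch for ROTATE2.  */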

        /* tmp := X15 */
        movdqa (STACK_TMP)(%rsp), X11;
        PBROADCASTD((0 * 4)(INPUT), X15);
        PLUS(X0, X15);
        PBROADCASTD((1 * 4)(INPUT), X15);
        PLUS(X1, X15);
        PBROADCASTD((2 * 4)(INPUT), X15);
        PLUS(X2, X15);
        PBROADCASTD((3 * 4)(INPUT), X15);
        PLUS(X3, X15);
        PBROADCASTD((4 * 4)(INPUT), X15);
        PLUS(X4, X15);
        PBROADCASTD((5 * 4)(INPUT), X15);
        PLUS(X5, X15);
        PBROADCASTD((6 * 4)(INPUT), X15);
        PLUS(X6, X15);
        PBROADCASTD((7 * 4)(INPUT), X15);
        PLUS(X7, X15);
        PBROADCASTD((8 * 4)(INPUT), X15);
        PLUS(X8, X15);
        PBROADCASTD((9 * 4)(INPUT), X15);
        PLUS(X9, X15);
        PBROADCASTD((10 * 4)(INPUT), X15);
        PLUS(X10, X15);
        PBROADCASTD((11 * 4)(INPUT), X15);
        PLUS(X11, X15);
        movdqa (STACK_VEC_X12)(%rsp), X15;
        PLUS(X12, X15);
        movdqa (STACK_VEC_X13)(%rsp), X15;
        PLUS(X13, X15);
        movdqa X13, (STACK_TMP)(%rsp);
        PBROADCASTD((14 * 4)(INPUT), X15);
        PLUS(X14, X15);
        movdqa (STACK_TMP1)(%rsp), X15;
        movdqa X14, (STACK_TMP1)(%rsp);
        PBROADCASTD((15 * 4)(INPUT), X13);
        PLUS(X15, X13);
        movdqa X15, (STACK_TMP2)(%rsp);

        /* Update counter */
        addq $4, (12 * 4)(INPUT);
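
        /* Feed-forward and counter bump: each keystream word has the
         * corresponding input word added back (with X15, and finally
         * X13, as broadcast temporaries), X12/X13 pick up the per-block
         * counter vectors saved earlier, and the 64-bit add on words
         * 12-13 of the stored state moves the counter past the four
         * blocks just produced.  */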

        TRANSPOSE_4x4(X0, X1, X2, X3, X13, X14, X15);
        movdqu X0, (64 * 0 + 16 * 0)(DST)
        movdqu X1, (64 * 1 + 16 * 0)(DST)
        movdqu X2, (64 * 2 + 16 * 0)(DST)
        movdqu X3, (64 * 3 + 16 * 0)(DST)
        TRANSPOSE_4x4(X4, X5, X6, X7, X0, X1, X2);
        movdqa (STACK_TMP)(%rsp), X13;
        movdqa (STACK_TMP1)(%rsp), X14;
        movdqa (STACK_TMP2)(%rsp), X15;
        movdqu X4, (64 * 0 + 16 * 1)(DST)
        movdqu X5, (64 * 1 + 16 * 1)(DST)
        movdqu X6, (64 * 2 + 16 * 1)(DST)
        movdqu X7, (64 * 3 + 16 * 1)(DST)
        TRANSPOSE_4x4(X8, X9, X10, X11, X0, X1, X2);
        movdqu X8, (64 * 0 + 16 * 2)(DST)
        movdqu X9, (64 * 1 + 16 * 2)(DST)
        movdqu X10, (64 * 2 + 16 * 2)(DST)
        movdqu X11, (64 * 3 + 16 * 2)(DST)
        TRANSPOSE_4x4(X12, X13, X14, X15, X0, X1, X2);
        movdqu X12, (64 * 0 + 16 * 3)(DST)
        movdqu X13, (64 * 1 + 16 * 3)(DST)
        movdqu X14, (64 * 2 + 16 * 3)(DST)
        movdqu X15, (64 * 3 + 16 * 3)(DST)
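
        /* After the rounds, lane i of each Xj holds word j of block i.
         * Each TRANSPOSE_4x4 regroups four such vectors so that every
         * register carries 16 contiguous bytes of a single block,
         * which is then stored at DST + 64*block + 16*quarter; the
         * little-endian lane order of the stores matches the ChaCha20
         * serialization.  */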

        sub $4, NBLKS;
        lea (4 * 64)(DST), DST;
        lea (4 * 64)(SRC), SRC;
        jnz L(loop4);

        /* eax zeroed by round loop. */
        leave;
        cfi_adjust_cfa_offset(-8)
        cfi_def_cfa_register(%rsp);
        ret_spec_stop;
END (__chacha20_sse2_blocks4)

#endif /* if MINIMUM_X86_ISA_LEVEL <= 2 */